debian: Reimplement lister using new Lister API

Port debian lister to `swh.lister.pattern.Lister` API.

The new implementation will produce one instance of ListedOrigin model
per package, notably containing the set of parameters expected by the
debian loader.

The lister is also stateful, meaning only new packages and those with
newly found versions since the last listing will be returned.

Closes T2979
This commit is contained in:
Antoine Lambert 2021-01-20 14:17:15 +01:00
parent 6cd31769c1
commit bb0184c004
15 changed files with 732 additions and 787 deletions

View file

@ -1,76 +1,16 @@
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, List, Mapping
logger = logging.getLogger(__name__)
def debian_init(
    db_engine,
    override_conf: Mapping[str, Any] = {},
    distribution_name: str = "Debian",
    suites: List[str] = ["stretch", "buster", "bullseye"],
    components: List[str] = ["main", "contrib", "non-free"],
):
    """Initialize the debian data model.

    Register the distribution when it is not known yet, then create one area
    named ``<suite>/<component>`` for every combination that is not already
    attached to that distribution.

    Args:
        db_engine: SQLAlchemy manipulation database object
        override_conf: Override conf to pass to instantiate a lister
            (not read by this function)
        distribution_name: Distribution to initialize
        suites: Default suites to register with the lister
        components: Default components to register per suite

    """
    from sqlalchemy.orm import sessionmaker

    from swh.lister.debian.models import Area, Distribution

    # NOTE: the mutable defaults in the signature are kept for interface
    # compatibility; they are only read below, never mutated.
    db_session = sessionmaker(bind=db_engine)()
    try:
        distrib = (
            db_session.query(Distribution)
            .filter(Distribution.name == distribution_name)
            .one_or_none()
        )

        if distrib is None:
            distrib = Distribution(
                name=distribution_name,
                type="deb",
                mirror_uri="http://deb.debian.org/debian/",
            )
            db_session.add(distrib)

        # Collect the names of the areas already registered for the distribution
        existing_area = {
            a.name
            for a in db_session.query(Area).filter(Area.distribution == distrib).all()
        }
        logger.debug("Area already known: %s", ", ".join(existing_area))

        # Create only the new ones
        for suite in suites:
            for component in components:
                area_name = f"{suite}/{component}"
                if area_name in existing_area:
                    logger.debug("Area '%s' already set, skipping", area_name)
                    continue
                area = Area(name=area_name, distribution=distrib)
                db_session.add(area)

        db_session.commit()
    finally:
        # always release the session, even when a query or the commit failed
        db_session.close()
from typing import Any, Mapping
def register() -> Mapping[str, Any]:
from .lister import DebianLister
return {
"models": [DebianLister.MODEL],
"models": [],
"lister": DebianLister,
"task_modules": ["%s.tasks" % __name__],
"init": debian_init,
}

View file

@ -1,260 +1,287 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# Copyright (C) 2017-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import bz2
from collections import defaultdict
import datetime
from dataclasses import dataclass, field
import gzip
from itertools import product
import logging
import lzma
from typing import Any, Dict, Mapping, Optional
from typing import Any, Callable, Dict, Iterator, List, Set, Tuple
from urllib.parse import urljoin
from debian.deb822 import Sources
from requests import Response
from sqlalchemy.orm import joinedload, load_only
from sqlalchemy.schema import CreateTable, DropTable
import requests
from swh.lister.core.lister_base import FetchError, ListerBase
from swh.lister.core.lister_transports import ListerHttpTransport
from swh.lister.debian.models import (
AreaSnapshot,
Distribution,
DistributionSnapshot,
Package,
TempPackage,
)
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
decompressors = {
from .. import USER_AGENT
from ..pattern import Lister
logger = logging.getLogger(__name__)
decompressors: Dict[str, Callable[[Any], Any]] = {
"gz": lambda f: gzip.GzipFile(fileobj=f),
"bz2": bz2.BZ2File,
"xz": lzma.LZMAFile,
}
logger = logging.getLogger(__name__)
Suite = str
Component = str
PkgName = str
PkgVersion = str
DebianOrigin = str
DebianPageType = Iterator[Sources]
class DebianLister(ListerHttpTransport, ListerBase):
MODEL = Package
PATH_TEMPLATE = None
@dataclass
class DebianListerState:
    """State of debian lister

    Holds a single mapping, which defaults to an empty dictionary when the
    state is created without arguments.
    """

    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""
class DebianLister(Lister[DebianListerState, DebianPageType]):
"""
List source packages for a given debian or derivative distribution.
The lister will create a snapshot for each package name from all its
available versions.
If a package snapshot is different from the last listing operation,
it will be sent to the scheduler that will create a loading task
to archive newly found source code.
Args:
scheduler: instance of SchedulerInterface
distribution: identifier of listed distribution (e.g. Debian, Ubuntu)
mirror_url: debian package archives mirror URL
suites: list of distribution suites to process
components: list of package components to process
"""
LISTER_NAME = "debian"
instance = "debian"
def __init__(
self,
scheduler: SchedulerInterface,
distribution: str = "Debian",
date: Optional[datetime.datetime] = None,
override_config: Mapping = {},
mirror_url: str = "http://deb.debian.org/debian/",
suites: List[Suite] = ["stretch", "buster", "bullseye"],
components: List[Component] = ["main", "contrib", "non-free"],
):
"""Initialize the debian lister for a given distribution at a given
date.
Args:
distribution: name of the distribution (e.g. "Debian")
date: date the snapshot is taken (defaults to now if empty)
override_config: Override configuration (which takes precedence
over the parameters if provided)
"""
ListerHttpTransport.__init__(self, url="notused")
ListerBase.__init__(self, override_config=override_config)
self.distribution = override_config.get("distribution", distribution)
self.date = override_config.get("date", date) or datetime.datetime.now(
tz=datetime.timezone.utc
super().__init__(
scheduler=scheduler, url=mirror_url, instance=distribution,
)
def transport_request(self, identifier) -> Response:
"""Subvert ListerHttpTransport.transport_request, to try several
index URIs in turn.
# to ensure urljoin will produce valid Sources URL
if not self.url.endswith("/"):
self.url += "/"
The Debian repository format supports several compression algorithms
across the ages, so we try several URIs.
self.distribution = distribution
self.suites = suites
self.components = components
Once we have found a working URI, we break and set `self.decompressor`
to the one that matched.
self.session = requests.Session()
self.session.headers.update({"User-Agent": USER_AGENT})
Returns:
a requests Response object.
# will hold all listed origins info
self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {}
# will contain origin urls that have already been listed
# in a previous page
self.sent_origins: Set[DebianOrigin] = set()
# will contain already listed package info that need to be sent
# to the scheduler for update in the commit_page method
self.origins_to_update: Dict[DebianOrigin, ListedOrigin] = {}
# will contain the lister state after a call to run
self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
Raises:
FetchError: when all the URIs failed to be retrieved.
"""
response = None
compression = None
def state_from_dict(self, d: Dict[str, Any]) -> DebianListerState:
    """Build a lister state from its dictionary form.

    Each value of ``d`` is converted to a set and stored under the same key
    in the ``package_versions`` mapping of the returned state.
    """
    versions_by_package = {}
    for package_name, versions in d.items():
        versions_by_package[package_name] = set(versions)
    return DebianListerState(package_versions=versions_by_package)
for uri, compression in self.area.index_uris():
response = super().transport_request(uri)
def state_to_dict(self, state: DebianListerState) -> Dict[str, Any]:
    """Turn a lister state into a plain dictionary.

    Every entry of ``state.package_versions`` is kept under the same key,
    with its collection of versions converted to a list.
    """
    serialized: Dict[str, Any] = {}
    for package_name, versions in state.package_versions.items():
        serialized[package_name] = list(versions)
    return serialized
def debian_index_urls(
    self, suite: Suite, component: Component
) -> Iterator[Tuple[str, str]]:
    """Yield the candidate URLs of the Sources file of a suite and component.

    Each item is a ``(url, compression)`` pair: the compressed variants come
    first (xz, bz2, gz), then the plain file whose compression is the empty
    string.
    """
    sources_url = urljoin(self.url, f"dists/{suite}/{component}/source/Sources")
    for compression in ("xz", "bz2", "gz", ""):
        # the uncompressed index has no file extension
        suffix = f".{compression}" if compression else ""
        yield (sources_url + suffix, compression)
def page_request(self, suite: Suite, component: Component) -> DebianPageType:
"""Return parsed package Sources file for a given debian suite and component."""
for url, compression in self.debian_index_urls(suite, component):
response = requests.get(url, stream=True)
logging.debug("Fetched URL: %s, status code: %s", url, response.status_code)
if response.status_code == 200:
break
else:
raise FetchError("Could not retrieve index for %s" % self.area)
self.decompressor = decompressors.get(compression)
return response
raise Exception(
"Could not retrieve sources index for %s/%s", suite, component
)
def request_uri(self, identifier):
# In the overridden transport_request, we pass
# ListerBase.transport_request() the full URI as identifier, so we
# need to return it here.
return identifier
def request_params(self, identifier) -> Dict[str, Any]:
# Enable streaming to allow wrapping the response in the decompressor
# in transport_response_simplified.
params = super().request_params(identifier)
params["stream"] = True
return params
def transport_response_simplified(self, response):
"""Decompress and parse the package index fetched in `transport_request`.
For each package, we "pivot" the file list entries (Files,
Checksums-Sha1, Checksums-Sha256), to return a files dict mapping
filenames to their checksums.
"""
if self.decompressor:
data = self.decompressor(response.raw)
decompressor = decompressors.get(compression)
if decompressor:
data = decompressor(response.raw)
else:
data = response.raw
for src_pkg in Sources.iter_paragraphs(data.readlines()):
files = defaultdict(dict)
return Sources.iter_paragraphs(data.readlines())
for field in src_pkg._multivalued_fields:
if field.startswith("checksums-"):
sum_name = field[len("checksums-") :]
def get_pages(self) -> Iterator[DebianPageType]:
    """Yield one parsed Sources page per (suite, component) combination.

    The suite and component being processed are recorded on the lister
    (``current_suite`` / ``current_component``) before the page is requested.
    """
    combinations = product(self.suites, self.components)
    for suite, component in combinations:
        logger.debug(
            "Processing %s %s source packages info for %s component.",
            self.instance,
            suite,
            component,
        )
        self.current_suite, self.current_component = suite, component
        page = self.page_request(suite, component)
        yield page
def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin:
    """Build the origin url of a package.

    The url has the form ``deb://<instance>/packages/<package_name>`` where
    ``<instance>`` is the ``instance`` attribute of the lister.
    """
    origin_url = f"deb://{self.instance}/packages/{package_name}"
    return origin_url
def get_origins_from_page(self, page: DebianPageType) -> Iterator[ListedOrigin]:
"""Convert a page of debian package sources into an iterator of ListedOrigin.
Please note that the returned origins correspond to packages only
listed for the first time in order to get an accurate origins counter
in the statistics returned by the run method of the lister.
Packages already listed in another page but with different versions will
be put in cache by the method and updated ListedOrigin objects will
be sent to the scheduler later in the commit_page method.
Indeed as multiple debian suites can be processed, a similar set of
package names can be listed for two different package source pages,
only their version will differ, resulting in origins counted multiple
times in lister statistics.
"""
assert self.lister_obj.id is not None
origins_to_send = {}
self.origins_to_update = {}
# iterate on each package source info
for src_pkg in page:
# gather package files info that will be used by the debian loader
files: Dict[str, Dict[str, Any]] = defaultdict(dict)
for field_ in src_pkg._multivalued_fields:
if field_.startswith("checksums-"):
sum_name = field_[len("checksums-") :]
else:
sum_name = "md5sum"
if field in src_pkg:
for entry in src_pkg[field]:
if field_ in src_pkg:
for entry in src_pkg[field_]:
name = entry["name"]
files[name]["name"] = entry["name"]
files[name]["size"] = int(entry["size"], 10)
files[name][sum_name] = entry[sum_name]
yield {
"name": src_pkg["Package"],
"version": src_pkg["Version"],
"directory": src_pkg["Directory"],
"files": files,
}
# extract package name and version
package_name = src_pkg["Package"]
package_version = src_pkg["Version"]
# build origin url
origin_url = self.origin_url_for_package(package_name)
def inject_repo_data_into_db(self, models_list):
"""Generate the Package entries that didn't previously exist.
Contrary to ListerBase, we don't actually insert the data in
database. `schedule_missing_tasks` does it once we have the
origin and task identifiers.
"""
by_name_version = {}
temp_packages = []
area_id = self.area.id
for model in models_list:
name = model["name"]
version = model["version"]
temp_packages.append(
{"area_id": area_id, "name": name, "version": version,}
)
by_name_version[name, version] = model
# Add all the listed packages to a temporary table
self.db_session.execute(CreateTable(TempPackage.__table__))
self.db_session.bulk_insert_mappings(TempPackage, temp_packages)
def exists_tmp_pkg(db_session, model):
return (
db_session.query(model)
.filter(Package.area_id == TempPackage.area_id)
.filter(Package.name == TempPackage.name)
.filter(Package.version == TempPackage.version)
.exists()
# create package version key as expected by the debian loader
package_version_key = (
f"{self.current_suite}/{self.current_component}/{package_version}"
)
# Filter out the packages that already exist in the main Package table
new_packages = (
self.db_session.query(TempPackage)
.options(load_only("name", "version"))
.filter(~exists_tmp_pkg(self.db_session, Package))
.all()
)
# this is the first time a package is listed
if origin_url not in self.listed_origins:
# create a ListedOrigin object for it that can be later
# updated with new package versions info
self.listed_origins[origin_url] = ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="deb",
extra_loader_arguments={"date": None, "packages": {}},
)
# origin will be yielded at the end of that method
origins_to_send[origin_url] = self.listed_origins[origin_url]
# init set that will contain all listed package versions
self.package_versions[package_name] = set()
self.old_area_packages = (
self.db_session.query(Package)
.filter(exists_tmp_pkg(self.db_session, TempPackage))
.all()
)
# package has already been listed in a previous page or current page
elif origin_url not in origins_to_send:
# if package has been listed in a previous page, its new versions
# will be added to its ListedOrigin object but the update will
# be sent to the scheduler in the commit_page method
self.origins_to_update[origin_url] = self.listed_origins[origin_url]
self.db_session.execute(DropTable(TempPackage.__table__))
# update package versions data in parameter that will be provided
# to the debian loader
self.listed_origins[origin_url].extra_loader_arguments["packages"].update(
{
package_version_key: {
"name": package_name,
"version": package_version,
"files": files,
}
}
)
added_packages = []
for package in new_packages:
model = by_name_version[package.name, package.version]
# add package version key to the set of found versions
self.package_versions[package_name].add(package_version_key)
added_packages.append(Package(area=self.area, **model))
# package has already been listed during a previous listing process
if package_name in self.state.package_versions:
new_versions = (
self.package_versions[package_name]
- self.state.package_versions[package_name]
)
# no new versions so far, no need to send the origin to the scheduler
if not new_versions:
origins_to_send.pop(origin_url, None)
self.origins_to_update.pop(origin_url, None)
# new versions found, ensure the origin will be sent to the scheduler
elif origin_url not in self.sent_origins:
self.origins_to_update.pop(origin_url, None)
origins_to_send[origin_url] = self.listed_origins[origin_url]
self.db_session.add_all(added_packages)
return added_packages
def schedule_missing_tasks(self, models_list, added_packages):
"""We create tasks at the end of the full snapshot processing"""
return
def create_tasks_for_snapshot(self, snapshot):
tasks = [
snapshot.task_for_package(name, versions)
for name, versions in snapshot.get_packages().items()
]
return self.scheduler.create_tasks(tasks)
def run(self):
"""Run the lister for a given (distribution, area) tuple.
"""
distribution = (
self.db_session.query(Distribution)
.options(joinedload(Distribution.areas))
.filter(Distribution.name == self.distribution)
.one_or_none()
)
if not distribution:
logger.error("Distribution %s is not registered" % self.distribution)
return {"status": "failed"}
if not distribution.type == "deb":
logger.error("Distribution %s is not a Debian derivative" % distribution)
return {"status": "failed"}
date = self.date
# update already counted origins with changes since last page
self.sent_origins.update(origins_to_send.keys())
logger.debug(
"Creating snapshot for distribution %s on date %s" % (distribution, date)
"Found %s new packages, %s packages with new versions.",
len(origins_to_send),
len(self.origins_to_update),
)
logger.debug(
"Current total number of listed packages is equal to %s.",
len(self.listed_origins),
)
snapshot = DistributionSnapshot(date=date, distribution=distribution)
yield from origins_to_send.values()
self.db_session.add(snapshot)
def get_origins_to_update(self) -> Iterator[ListedOrigin]:
    """Yield every origin currently stored in ``self.origins_to_update``."""
    for origin in self.origins_to_update.values():
        yield origin
for area in distribution.areas:
if not area.active:
continue
def commit_page(self, page: DebianPageType):
    """Send to the scheduler the already listed origins for which new versions
    were found in the current page.

    The origins come from ``get_origins_to_update``; the ``page`` argument
    itself is not read.
    """
    updated_origins = self.get_origins_to_update()
    self.send_origins(updated_origins)
self.area = area
logger.debug("Processing area %s" % area)
_, new_area_packages = self.ingest_data(None)
area_snapshot = AreaSnapshot(snapshot=snapshot, area=area)
self.db_session.add(area_snapshot)
area_snapshot.packages.extend(new_area_packages)
area_snapshot.packages.extend(self.old_area_packages)
self.create_tasks_for_snapshot(snapshot)
self.db_session.commit()
return {"status": "eventful"}
def finalize(self):
    """Store the listed package versions in the lister state and flag the
    lister as updated when at least one origin was sent."""
    # the mapping between package names and versions becomes the lister state
    self.state.package_versions = self.package_versions
    self.updated = bool(self.sent_origins)

View file

@ -1,230 +0,0 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import binascii
from collections import defaultdict
import datetime
from typing import Any, Mapping
from sqlalchemy import (
Boolean,
Column,
DateTime,
Enum,
ForeignKey,
Integer,
LargeBinary,
String,
Table,
UniqueConstraint,
)
try:
from sqlalchemy import JSON
except ImportError:
# SQLAlchemy < 1.1
from sqlalchemy.dialects.postgresql import JSONB as JSON
from sqlalchemy.orm import relationship
from swh.lister.core.models import SQLBase
class Distribution(SQLBase):
    """A distribution (e.g. Debian, Ubuntu, Fedora, ...)"""

    __tablename__ = "distribution"

    id = Column(Integer, primary_key=True)
    # name of the distribution, unique across the table
    name = Column(String, unique=True, nullable=False)
    # packaging family ("deb" or "rpm"); also used as the origin url scheme
    type = Column(Enum("deb", "rpm", name="distribution_types"), nullable=False)
    # base URI of the mirror the package indexes and files are fetched from
    mirror_uri = Column(String, nullable=False)

    # areas attached to this distribution
    areas = relationship("Area", back_populates="distribution")

    def origin_for_package(self, package_name: str) -> str:
        """Return the origin url for the given package, formatted as
        ``<type>://<name>/packages/<package_name>``
        """
        return "%s://%s/packages/%s" % (self.type, self.name, package_name)

    def __repr__(self):
        return "Distribution(%s (%s) on %s)" % (self.name, self.type, self.mirror_uri,)
class Area(SQLBase):
    """A named area of a distribution; the name is unique per distribution."""

    __tablename__ = "area"
    __table_args__ = (UniqueConstraint("distribution_id", "name"),)

    id = Column(Integer, primary_key=True)
    distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False)
    name = Column(String, nullable=False)
    active = Column(Boolean, nullable=False, default=True)

    distribution = relationship("Distribution", back_populates="areas")

    def index_uris(self):
        """Yield the possible ``(uri, compression)`` pairs for the package index
        of this area: the xz, bz2 and gz variants first, then the uncompressed
        index whose compression is ``None``.

        Raises:
            NotImplementedError: when the distribution type is not "deb"
        """
        distribution = self.distribution
        if distribution.type != "deb":
            raise NotImplementedError(
                "Do not know how to build index URI for Distribution type %s"
                % distribution.type
            )

        base_uri = "%s/dists/%s/source/Sources" % (distribution.mirror_uri, self.name)
        for ext in ("xz", "bz2", "gz"):
            yield (base_uri + "." + ext, ext)
        # last candidate: the index without any compression
        yield (base_uri, None)

    def __repr__(self):
        return "Area(%s of %s)" % (self.name, self.distribution.name,)
class Package(SQLBase):
    """A version of a source package in an area; the (area, name, version)
    triple is unique."""

    __tablename__ = "package"
    __table_args__ = (UniqueConstraint("area_id", "name", "version"),)

    id = Column(Integer, primary_key=True)
    area_id = Column(Integer, ForeignKey("area.id"), nullable=False)
    name = Column(String, nullable=False)
    version = Column(String, nullable=False)
    directory = Column(String, nullable=False)
    files = Column(JSON, nullable=False)

    origin_id = Column(Integer)
    task_id = Column(Integer)
    revision_id = Column(LargeBinary(20))

    area = relationship("Area")

    @property
    def distribution(self):
        """Distribution of the area the package belongs to"""
        return self.area.distribution

    def fetch_uri(self, filename):
        """Get the URI to fetch the `filename` file associated with the
        package

        Raises:
            NotImplementedError: when the distribution type is not "deb"
        """
        distribution = self.distribution
        if distribution.type != "deb":
            raise NotImplementedError(
                "Do not know how to build fetch URI for Distribution type %s"
                % distribution.type
            )
        return "%s/%s/%s" % (distribution.mirror_uri, self.directory, filename)

    def loader_dict(self):
        """Return the dictionary describing the package: its id, name and
        version, plus either the hex-encoded revision id, or a ``None``
        revision id along with the files, each completed with its fetch uri.
        """
        ret = {
            "id": self.id,
            "name": self.name,
            "version": self.version,
        }
        if self.revision_id:
            ret["revision_id"] = binascii.hexlify(self.revision_id).decode()
            return ret

        # work on copies so the stored checksums are left untouched
        files = {}
        for name, checksums in self.files.items():
            file_info = checksums.copy()
            file_info["uri"] = self.fetch_uri(name)
            files[name] = file_info

        ret["revision_id"] = None
        ret["files"] = files
        return ret

    def __repr__(self):
        area = self.area
        return "Package(%s_%s of %s %s)" % (
            self.name,
            self.version,
            self.distribution.name,
            area.name,
        )
class DistributionSnapshot(SQLBase):
    """Snapshot of a distribution at a given date, made of area snapshots."""

    __tablename__ = "distribution_snapshot"

    id = Column(Integer, primary_key=True)
    date = Column(DateTime, nullable=False, index=True)
    distribution_id = Column(Integer, ForeignKey("distribution.id"), nullable=False)

    distribution = relationship("Distribution")
    areas = relationship("AreaSnapshot", back_populates="snapshot")

    def task_for_package(
        self, package_name: str, package_versions: Mapping
    ) -> Mapping[str, Any]:
        """Return the task dictionary for the given list of package versions

        The task is a oneshot ``load-<distribution type>-package`` task whose
        next run is the current UTC time.
        """
        distribution = self.distribution
        loader_kwargs = {
            "url": distribution.origin_for_package(package_name),
            "date": self.date.isoformat(),
            "packages": package_versions,
        }
        task_type = "load-%s-package" % distribution.type
        return {
            "policy": "oneshot",
            "type": task_type,
            "next_run": datetime.datetime.now(tz=datetime.timezone.utc),
            "arguments": {"args": [], "kwargs": loader_kwargs},
            "retries_left": 3,
        }

    def get_packages(self):
        """Return, for each package name, a mapping from
        ``<area name>/<version>`` to the loader dictionary of that package
        version, across all the area snapshots."""
        packages = defaultdict(dict)
        for area_snapshot in self.areas:
            area_name = area_snapshot.area.name
            for package in area_snapshot.packages:
                versions = packages[package.name]
                versions["%s/%s" % (area_name, package.version)] = package.loader_dict()
        return packages
# Association table linking area snapshots to the packages they contain
# (used as ``secondary`` of the ``AreaSnapshot.packages`` relationship below).
area_snapshot_package_assoc = Table(
    "area_snapshot_package",
    SQLBase.metadata,
    Column("area_snapshot_id", Integer, ForeignKey("area_snapshot.id"), nullable=False),
    Column("package_id", Integer, ForeignKey("package.id"), nullable=False),
)


class AreaSnapshot(SQLBase):
    # Snapshot of one area, attached to a distribution snapshot and listing
    # the packages of the area through the association table above.
    __tablename__ = "area_snapshot"

    id = Column(Integer, primary_key=True)
    snapshot_id = Column(
        Integer, ForeignKey("distribution_snapshot.id"), nullable=False
    )
    area_id = Column(Integer, ForeignKey("area.id"), nullable=False)

    # parent distribution snapshot (reverse side: DistributionSnapshot.areas)
    snapshot = relationship("DistributionSnapshot", back_populates="areas")
    area = relationship("Area")
    # many-to-many link to the packages, through area_snapshot_package
    packages = relationship("Package", secondary=area_snapshot_package_assoc)
class TempPackage(SQLBase):
    # Scratch table holding (area_id, name, version) triples; the TEMPORARY
    # prefix is emitted in front of TABLE when the table is created.
    __tablename__ = "temp_package"
    __table_args__ = {
        "prefixes": ["TEMPORARY"],
    }

    id = Column(Integer, primary_key=True)
    # no foreign key nor constraint here: plain values to compare against
    area_id = Column(Integer)
    name = Column(String)
    version = Column(String)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2018 the Software Heritage developers
# Copyright (C) 2017-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -8,9 +8,9 @@ from .lister import DebianLister
@shared_task(name=__name__ + ".DebianListerTask")
def list_debian_distribution(distribution, **lister_args):
def list_debian_distribution(**lister_args):
"""List a Debian distribution"""
return DebianLister(distribution=distribution, **lister_args).run()
return DebianLister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".ping")

View file

@ -1,61 +0,0 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import pytest
from sqlalchemy.orm import sessionmaker
from swh.core.db.pytest_plugin import postgresql_fact
from swh.lister.debian import debian_init
import swh.scheduler
# Directory named "sql" located next to the swh.scheduler package module
SQL_DIR = os.path.join(os.path.dirname(swh.scheduler.__file__), "sql")

# Fixture factory call for the "scheduler-lister" test database, fed with every
# *.sql file found in SQL_DIR.
postgresql_scheduler = postgresql_fact(
    "postgresql_proc",
    db_name="scheduler-lister",
    dump_files=os.path.join(SQL_DIR, "*.sql"),
    # do not truncate the task tables, it's required in between test
    no_truncate_tables={"dbversion", "priority_ratio", "task"},
)
@pytest.fixture
def swh_scheduler_config(postgresql_scheduler):
    """Scheduler configuration whose "db" entry is the dsn of the
    ``postgresql_scheduler`` fixture."""
    scheduler_dsn = postgresql_scheduler.dsn
    return {"db": scheduler_dsn}
@pytest.fixture
def lister_under_test():
    """Name of the lister exercised by the tests of this module."""
    lister_name = "debian"
    return lister_name
@pytest.fixture
def lister_debian(swh_lister):
    """Return ``swh_lister`` once the debian data model is initialized (stretch
    suite, main and contrib components) and the ``load-deb-package`` task type
    is registered in its scheduler."""
    # Initialize the debian data model
    debian_init(
        swh_lister.db_engine, suites=["stretch"], components=["main", "contrib"]
    )

    # Add the load-deb-package in the scheduler backend
    load_deb_task_type = {
        "type": "load-deb-package",
        "description": "Load a Debian package",
        "backend_name": "swh.loader.packages.debian.tasks.LoaderDebianPackage",
        "default_interval": "1 day",
    }
    swh_lister.scheduler.create_task_type(load_deb_task_type)

    return swh_lister
@pytest.fixture
def session(lister_db_url, engine):
    """Yield a session bound to ``engine``; afterwards the session is closed
    and the engine disposed."""
    session_factory = sessionmaker(bind=engine)
    db_session = session_factory()
    yield db_session
    db_session.close()
    engine.dispose()

View file

@ -0,0 +1,107 @@
Package: git
Binary: git, git-man, git-doc, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all
Version: 1:2.29.2-1
Maintainer: Jonathan Nieder <jrnieder@gmail.com>
Uploaders: Anders Kaseorg <andersk@mit.edu>
Build-Depends: libz-dev, gettext, libpcre2-dev | libpcre3-dev, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, python3, libhttp-date-perl | libtime-parsedate-perl, libcgi-pm-perl, liberror-perl, libmailtools-perl, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper-compat (= 10), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~)
Build-Depends-Indep: asciidoc (>= 8.6.10), xmlto, docbook-xsl
Architecture: any all
Standards-Version: 4.3.0.1
Format: 3.0 (quilt)
Files:
ef246c390b2673819cd55085984fb6bc 2867 git_2.29.2-1.dsc
f5f9d4e7a3c633bc7a9178cfd822045f 6187988 git_2.29.2.orig.tar.xz
cfed1fd3dffd4fb31a0319e51471877f 663292 git_2.29.2-1.debian.tar.xz
Vcs-Browser: https://repo.or.cz/w/git/debian.git/
Vcs-Git: https://repo.or.cz/r/git/debian.git/
Checksums-Sha256:
9f2203314f0d076e24750fa29f38d1bb49d4124f3e8d8789b751c84473e57ead 2867 git_2.29.2-1.dsc
f2fc436ebe657821a1360bcd1e5f4896049610082419143d60f6fa13c2f607c1 6187988 git_2.29.2.orig.tar.xz
ad79671893257ca6205156c7c58d06e265d793f076c0efc8e225e832217f760a 663292 git_2.29.2-1.debian.tar.xz
Homepage: https://git-scm.com/
Package-List:
git deb vcs optional arch=any
git-all deb vcs optional arch=all
git-cvs deb vcs optional arch=all
git-daemon-run deb vcs optional arch=all
git-daemon-sysvinit deb vcs optional arch=all
git-doc deb doc optional arch=all
git-el deb vcs optional arch=all
git-email deb vcs optional arch=all
git-gui deb vcs optional arch=all
git-man deb doc optional arch=all
git-mediawiki deb vcs optional arch=all
git-svn deb vcs optional arch=all
gitk deb vcs optional arch=all
gitweb deb vcs optional arch=all
Directory: pool/main/g/git
Priority: source
Section: vcs
Package: subversion
Binary: subversion, libsvn1, libsvn-dev, libsvn-doc, libapache2-mod-svn, python3-subversion, subversion-tools, libsvn-java, libsvn-perl, ruby-svn
Version: 1.14.0-3
Maintainer: James McCoy <jamessan@debian.org>
Build-Depends: autoconf, bash-completion, chrpath, debhelper-compat (= 12), default-jdk-headless (>= 2:1.8) [!hurd-i386 !hppa !sparc] <!pkg.subversion.nojava>, dh-apache2, dh-python, doxygen, junit4 [!hurd-i386 !hppa !sparc] <!pkg.subversion.nojava>, libapr1-dev, libaprutil1-dev, libdb5.3-dev, libdbus-1-dev, liblz4-dev (>= 0.0~r129), libkf5coreaddons-dev <!pkg.subversion.nokde>, libkf5i18n-dev <!pkg.subversion.nokde>, libkf5wallet-dev <!pkg.subversion.nokde>, libperl-dev, libsasl2-dev, libsecret-1-dev, libserf-dev (>= 1.3.9-4~), libsqlite3-dev (>= 3.8.7), libtool, libutf8proc-dev, perl, py3c-dev, python3-all-dev, rename, ruby <!pkg.subversion.noruby>, ruby-dev <!pkg.subversion.noruby>, swig (>= 3.0.10), zlib1g-dev
Build-Conflicts: libsvn-dev (>= 1.15~), libsvn-dev (<< 1.14~), libsvn1 (>= 1.15~), libsvn1 (<< 1.14~)
Architecture: any all
Standards-Version: 4.5.0
Format: 3.0 (quilt)
Files:
65f7c225ddbcc855b57341954268098b 3807 subversion_1.14.0-3.dsc
0136e67d8f58731b2858b9f2dba7c536 11519871 subversion_1.14.0.orig.tar.gz
f68b938ba71e19f333069bfd3c6ec236 3917 subversion_1.14.0.orig.tar.gz.asc
de6248e80a7f8b6481606ff16a9e9237 427396 subversion_1.14.0-3.debian.tar.xz
Vcs-Browser: https://salsa.debian.org/jamessan/subversion
Vcs-Git: https://salsa.debian.org/jamessan/subversion.git
Checksums-Sha256:
ebe6e2417a79ad5254072d994ccf6313489a90f299304ee2ccfb6ebe1392c580 3807 subversion_1.14.0-3.dsc
ef3d1147535e41874c304fb5b9ea32745fbf5d7faecf2ce21d4115b567e937d0 11519871 subversion_1.14.0.orig.tar.gz
98333df38d29a64500d4ad1693741d3d087485555207289b4e53af309abac71a 3917 subversion_1.14.0.orig.tar.gz.asc
fd5383bf82ccf89acd7caf0fd80dc01ee2f7a3e163dcab6b2646ad01b7b746d9 427396 subversion_1.14.0-3.debian.tar.xz
Homepage: http://subversion.apache.org/
Dgit: 6ef306f777223c0d5c2eaab0586420ada61435f3 debian archive/debian/1.14.0-3 https://git.dgit.debian.org/subversion
Package-List:
libapache2-mod-svn deb httpd optional arch=any
libsvn-dev deb libdevel optional arch=any
libsvn-doc deb doc optional arch=all
libsvn-java deb java optional arch=any profile=!pkg.subversion.nojava
libsvn-perl deb perl optional arch=any
libsvn1 deb libs optional arch=any
python3-subversion deb python optional arch=any
ruby-svn deb ruby optional arch=any profile=!pkg.subversion.noruby
subversion deb vcs optional arch=any
subversion-tools deb vcs optional arch=any
Testsuite: autopkgtest
Testsuite-Triggers: apache2, wget
Directory: pool/main/s/subversion
Priority: source
Section: vcs
Package: hg-git
Binary: mercurial-git
Version: 0.9.0-2
Maintainer: Debian Python Team <team+python@tracker.debian.org>
Uploaders: Tristan Seligmann <mithrandi@debian.org>
Build-Depends: debhelper-compat (= 13), dh-python, git, python3-mercurial, openssh-client, python3, python3-dulwich (>= 0.20.6), python3-setuptools, unzip
Architecture: all
Standards-Version: 4.5.0
Format: 3.0 (quilt)
Files:
7dee1b877cf129c1f6ee618ebf690179 2090 hg-git_0.9.0-2.dsc
bcf30d513d8463332288aa93c1c67d3e 129138 hg-git_0.9.0.orig.tar.bz2
5674d6e2e8271150adf68b08833e4806 6996 hg-git_0.9.0-2.debian.tar.xz
Vcs-Browser: https://salsa.debian.org/python-team/packages/hg-git
Vcs-Git: https://salsa.debian.org/python-team/packages/hg-git.git
Checksums-Sha256:
a40beaef731c00a820d89918afedc1f01580d87f6e8c29e74903b1e108e38b27 2090 hg-git_0.9.0-2.dsc
eedd8773de76b21b47fd21a7e5c04c05c7ab0ecfc62a54bc947eb225b2c44424 129138 hg-git_0.9.0.orig.tar.bz2
ded524f1688a248a0eefbd0cf9843daedf60001cc39bfbb9e89734742fa4a4d2 6996 hg-git_0.9.0-2.debian.tar.xz
Homepage: https://hg-git.github.io/
Package-List:
mercurial-git deb vcs optional arch=all
Testsuite: autopkgtest
Testsuite-Triggers: git, openssh-client, unzip
Directory: pool/main/h/hg-git
Priority: source
Section: vcs

View file

@ -0,0 +1,78 @@
Package: git
Binary: git, git-man, git-doc, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all
Version: 1:2.20.1-2+deb10u3
Maintainer: Gerrit Pape <pape@smarden.org>
Uploaders: Jonathan Nieder <jrnieder@gmail.com>, Anders Kaseorg <andersk@mit.edu>
Build-Depends: libz-dev, gettext, libpcre2-dev | libpcre3-dev, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, python, libhttp-date-perl | libtime-parsedate-perl, libcgi-pm-perl, liberror-perl, libmailtools-perl, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper (>= 9), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~)
Build-Depends-Indep: asciidoc (>= 8.6.10), xmlto, docbook-xsl
Architecture: any all
Standards-Version: 4.3.0.1
Format: 3.0 (quilt)
Files:
fcfb1e01b74dfa383f8171ae7d331de9 2923 git_2.20.1-2+deb10u3.dsc
5fb4ff92b56ce3172b99c1c74c046c1a 5359872 git_2.20.1.orig.tar.xz
3b629f9b0d2da6fa6ce5816478a57e09 646216 git_2.20.1-2+deb10u3.debian.tar.xz
Vcs-Browser: https://repo.or.cz/w/git/debian.git/
Vcs-Git: https://repo.or.cz/r/git/debian.git/
Checksums-Sha256:
6322d0dbe9b867a6cd1cd75f95a4a20335faa2030c38688f460ddaaaacbd4d06 2923 git_2.20.1-2+deb10u3.dsc
9d2e91e2faa2ea61ba0a70201d023b36f54d846314591a002c610ea2ab81c3e9 5359872 git_2.20.1.orig.tar.xz
3c6e2f8495350bccd0981d579d4d1cac6b0e051e1f7ba8b1d22c842bd4cb3453 646216 git_2.20.1-2+deb10u3.debian.tar.xz
Homepage: https://git-scm.com/
Package-List:
git deb vcs optional arch=any
git-all deb vcs optional arch=all
git-cvs deb vcs optional arch=all
git-daemon-run deb vcs optional arch=all
git-daemon-sysvinit deb vcs optional arch=all
git-doc deb doc optional arch=all
git-el deb vcs optional arch=all
git-email deb vcs optional arch=all
git-gui deb vcs optional arch=all
git-man deb doc optional arch=all
git-mediawiki deb vcs optional arch=all
git-svn deb vcs optional arch=all
gitk deb vcs optional arch=all
gitweb deb vcs optional arch=all
Directory: pool/main/g/git
Priority: source
Section: vcs
Package: subversion
Binary: subversion, libsvn1, libsvn-dev, libsvn-doc, libapache2-mod-svn, python-subversion, subversion-tools, libsvn-java, libsvn-perl, ruby-svn
Version: 1.10.4-1+deb10u1
Maintainer: James McCoy <jamessan@debian.org>
Build-Depends: apache2-dev (>= 2.4.16), autoconf, bash-completion, chrpath, debhelper (>= 11~), default-jdk-headless (>= 2:1.6) [!hurd-i386 !hppa !sparc], dh-apache2, dh-python, doxygen, junit [!hurd-i386 !hppa !sparc], libapr1-dev, libaprutil1-dev, libdb5.3-dev, libdbus-1-dev, liblz4-dev (>= 0.0~r129), libkf5coreaddons-dev, libkf5i18n-dev, libkf5wallet-dev, libperl-dev, libsasl2-dev, libsecret-1-dev, libserf-dev (>= 1.3.9-4~), libsqlite3-dev (>= 3.8.7), libtool, libutf8proc-dev, perl, python-all-dev (>= 2.7), rename, ruby, ruby-dev, swig, zlib1g-dev
Build-Conflicts: libsvn-dev (<< 1.10~)
Architecture: any all
Standards-Version: 4.3.0
Format: 3.0 (quilt)
Files:
70b1d3c8ae91301a3f7766b8181d09c9 3428 subversion_1.10.4-1+deb10u1.dsc
fcfd1bcd95a8b44e6a6de3a97425aead 11347907 subversion_1.10.4.orig.tar.gz
98e9c6902e6a18973b3d936657384a88 2107 subversion_1.10.4.orig.tar.gz.asc
a4a14bcff3cef49d0d9388356213f3e4 438024 subversion_1.10.4-1+deb10u1.debian.tar.xz
Vcs-Browser: https://salsa.debian.org/jamessan/subversion
Vcs-Git: https://salsa.debian.org/jamessan/subversion.git
Checksums-Sha256:
c9956fd5b850924dd123048b39195b3d591f55b9cbdf18d4d2a0f496f7decc72 3428 subversion_1.10.4-1+deb10u1.dsc
354022a837596eb1b5676639ea8d73aa326fa8b2c610d8e1b39aeb7228921f4e 11347907 subversion_1.10.4.orig.tar.gz
bc6173c43ac837f875d9f2921e118c194455796b419769e155496cf084376428 2107 subversion_1.10.4.orig.tar.gz.asc
1bc8900ef1b9d2af84827dab0fd0164e2058381be3bba0db6fd13cbc858c9b1e 438024 subversion_1.10.4-1+deb10u1.debian.tar.xz
Homepage: http://subversion.apache.org/
Package-List:
libapache2-mod-svn deb httpd optional arch=any
libsvn-dev deb libdevel optional arch=any
libsvn-doc deb doc optional arch=all
libsvn-java deb java optional arch=any
libsvn-perl deb perl optional arch=any
libsvn1 deb libs optional arch=any
python-subversion deb python optional arch=any
ruby-svn deb ruby optional arch=any
subversion deb vcs optional arch=any
subversion-tools deb vcs optional arch=any
Testsuite: autopkgtest
Testsuite-Triggers: apache2, wget
Directory: pool/main/s/subversion
Priority: source
Section: vcs

View file

@ -0,0 +1,113 @@
Package: dh-elpa
Binary: dh-elpa
Version: 0.0.18
Maintainer: Debian Emacs addons team <pkg-emacsen-addons@lists.alioth.debian.org>
Uploaders: David Bremner <bremner@debian.org>
Build-Depends: debhelper (>= 9), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~)
Architecture: all
Standards-Version: 3.9.6
Format: 1.0
Files:
25beb4376110fe075460f4b7776d0349 1471 dh-elpa_0.0.18.dsc
dc0d3b42c1db80cac9817f43c171bfb3 10038 dh-elpa_0.0.18.tar.gz
Vcs-Browser: http://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/
Vcs-Git: git://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git
Checksums-Sha256:
87fb2f13d4a8cdea0cec752cc9873eef1c92961655315d2f14d178f9b1b7fc43 1471 dh-elpa_0.0.18.dsc
24e5be28cda286398db0018d9577493445c61a0602e239ca285a2981f1068b10 10038 dh-elpa_0.0.18.tar.gz
Package-List:
dh-elpa deb devel optional arch=all
Extra-Source-Only: yes
Directory: pool/main/d/dh-elpa
Priority: extra
Section: misc
Package: dh-elpa
Binary: dh-elpa
Version: 0.0.19
Maintainer: Debian Emacs addons team <pkg-emacsen-addons@lists.alioth.debian.org>
Uploaders: David Bremner <bremner@debian.org>
Build-Depends: debhelper (>= 9), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~)
Architecture: all
Standards-Version: 3.9.6
Format: 1.0
Files:
e4513c0f2112ba60031777ad0a65f9dc 1471 dh-elpa_0.0.19.dsc
ac70db483578ecac510612e1b894e53b 10291 dh-elpa_0.0.19.tar.gz
Vcs-Browser: http://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/
Vcs-Git: git://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git
Checksums-Sha256:
796a96fad0b03eb589f47c44406f8d32e5b8881dce34c425f1c915650618235c 1471 dh-elpa_0.0.19.dsc
4bb0a0ecdb75585e168a56a53c79e620b2da70584db9d29e136a3ae9f8a92a76 10291 dh-elpa_0.0.19.tar.gz
Package-List:
dh-elpa deb devel optional arch=all
Extra-Source-Only: yes
Directory: pool/main/d/dh-elpa
Priority: extra
Section: misc
Package: dh-elpa
Binary: dh-elpa
Version: 0.0.20
Maintainer: Debian Emacs addons team <pkg-emacsen-addons@lists.alioth.debian.org>
Uploaders: David Bremner <bremner@debian.org>, Sean Whitton <spwhitton@spwhitton.name>,
Build-Depends: debhelper (>= 9.20151004), emacs24-nox | emacs24 (>= 24~) | emacs24-lucid (>= 24~)
Architecture: all
Standards-Version: 3.9.8
Format: 1.0
Files:
82455df65ccd88896cdc083541d29236 1526 dh-elpa_0.0.20.dsc
4a7cc13b097e44228b5635c400e33202 12884 dh-elpa_0.0.20.tar.gz
Vcs-Browser: https://anonscm.debian.org/cgit/pkg-emacsen/pkg/dh-elpa.git/
Vcs-Git: https://anonscm.debian.org/pkg-emacsen/pkg/dh-elpa.git
Checksums-Sha256:
77c9761b1359c256ad25d4c7a826a27643a0094929a4cb3ac8cdaa0fcdb02d1b 1526 dh-elpa_0.0.20.dsc
13e4c6ffaaa6cd793d19de677af470ac0edac098779627e9f8555644a7da42f0 12884 dh-elpa_0.0.20.tar.gz
Package-List:
dh-elpa deb devel optional arch=all
Extra-Source-Only: yes
Directory: pool/main/d/dh-elpa
Priority: extra
Section: misc
Package: git
Binary: git, git-man, git-core, git-doc, git-arch, git-cvs, git-svn, git-mediawiki, git-email, git-daemon-run, git-daemon-sysvinit, git-gui, gitk, git-el, gitweb, git-all
Version: 1:2.11.0-3+deb9u7
Maintainer: Gerrit Pape <pape@smarden.org>
Uploaders: Jonathan Nieder <jrnieder@gmail.com>, Anders Kaseorg <andersk@mit.edu>
Build-Depends: libz-dev, libpcre3-dev, gettext, libcurl4-gnutls-dev, libexpat1-dev, subversion, libsvn-perl, libyaml-perl, tcl, libhttp-date-perl | libtime-modules-perl, libcgi-pm-perl, python, cvs, cvsps, libdbd-sqlite3-perl, unzip, libio-pty-perl, debhelper (>= 9), dh-exec (>= 0.7), dh-apache2, dpkg-dev (>= 1.16.2~)
Build-Depends-Indep: asciidoc, xmlto, docbook-xsl
Architecture: any all
Standards-Version: 3.9.6.0
Format: 3.0 (quilt)
Files:
e594aeada05ecb15253cc5768412ce3b 2944 git_2.11.0-3+deb9u7.dsc
dd4e3360e28aec5bb902fb34dd7fce3b 4197984 git_2.11.0.orig.tar.xz
e8d896e5307397f0e106e6a85c1b8682 610188 git_2.11.0-3+deb9u7.debian.tar.xz
Vcs-Browser: http://repo.or.cz/w/git/debian.git/
Vcs-Git: https://repo.or.cz/r/git/debian.git/
Checksums-Sha256:
7f2be1b1709c216ad06590687cc8fc0ff6b55a6c3e0ad6ec32b2567ce10adec1 2944 git_2.11.0-3+deb9u7.dsc
7e7e8d69d494892373b87007674be5820a4bc1ef596a0117d03ea3169119fd0b 4197984 git_2.11.0.orig.tar.xz
3f54b7ea7b8cda477ddb559c63de063c5bd49d8ab772330c05c79ace546ce38d 610188 git_2.11.0-3+deb9u7.debian.tar.xz
Homepage: https://git-scm.com/
Package-List:
git deb vcs optional arch=any
git-all deb vcs optional arch=all
git-arch deb vcs optional arch=all
git-core deb vcs optional arch=all
git-cvs deb vcs optional arch=all
git-daemon-run deb vcs optional arch=all
git-daemon-sysvinit deb vcs extra arch=all
git-doc deb doc optional arch=all
git-el deb vcs optional arch=all
git-email deb vcs optional arch=all
git-gui deb vcs optional arch=all
git-man deb doc optional arch=all
git-mediawiki deb vcs optional arch=all
git-svn deb vcs optional arch=all
gitk deb vcs optional arch=all
gitweb deb vcs optional arch=all
Directory: pool/main/g/git
Priority: source
Section: vcs

View file

@ -1,92 +0,0 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.lister.debian import debian_init
from swh.lister.debian.models import Area, Distribution
@pytest.fixture
def engine(session):
    """Return the engine bound to ``session`` after disabling its autoflush."""
    session.autoflush = False
    bound_engine = session.bind
    return bound_engine
def test_debian_init_step(engine, session):
    """Check that ``debian_init`` registers a distribution and its areas.

    The scenario starts with no matching distribution and no area, then checks
    in order that: the first call creates the distribution with one area per
    suite/component pair, an identical second call adds nothing, and a call
    with another suite adds one area per component for that suite.
    """
    distribution_name = "KaliLinux"
    suites = ["wheezy", "jessie"]
    components = ["main", "contrib"]

    def matching_distributions():
        # query restricted to the distribution under test
        return session.query(Distribution).filter(
            Distribution.name == distribution_name
        )

    # nothing must be registered before the first initialization
    assert matching_distributions().one_or_none() is None
    assert session.query(Area).all() == []

    debian_init(
        engine,
        distribution_name=distribution_name,
        suites=suites,
        components=components,
    )

    distrib = matching_distributions().one_or_none()
    assert distrib is not None
    assert distrib.name == distribution_name
    assert distrib.type == "deb"
    assert distrib.mirror_uri == "http://deb.debian.org/debian/"

    all_area = session.query(Area).all()
    assert len(all_area) == 2 * 2, "2 suites * 2 components per suite"

    expected_area_names = [
        f"{suite}/{component}" for suite in suites for component in components
    ]
    for area in all_area:
        area.id = None
        assert area.distribution == distrib
        assert area.name in expected_area_names

    # check idempotency (on exact same call)
    debian_init(
        engine,
        distribution_name=distribution_name,
        suites=suites,
        components=components,
    )

    distribs = matching_distributions().all()
    assert len(distribs) == 1

    all_area = session.query(Area).all()
    assert len(all_area) == 2 * 2, "2 suites * 2 components per suite"

    # Add a new suite
    debian_init(
        engine,
        distribution_name=distribution_name,
        suites=["lenny"],
        components=components,
    )

    area_names = [registered.name for registered in session.query(Area).all()]
    assert len(area_names) == (2 + 1) * 2, "3 suites * 2 components per suite"

View file

@ -1,35 +1,201 @@
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple
logger = logging.getLogger(__name__)
from debian.deb822 import Sources
import pytest
from swh.lister.debian.lister import (
DebianLister,
DebianOrigin,
PkgName,
PkgVersion,
Suite,
)
from swh.scheduler.interface import SchedulerInterface
# Those tests use sample debian Sources files whose content has been extracted
# from the real Sources files from stretch, buster and bullseye suites.
# They contain the following package source info
# - stretch:
# * dh-elpa (versions: 0.0.18, 0.0.19, 0.0.20),
# * git (version: 1:2.11.0-3+deb9u7)
# - buster:
# * git (version: 1:2.20.1-2+deb10u3),
# * subversion (version: 1.10.4-1+deb10u1)
# - bullseye:
# * git (version: 1:2.29.2-1)
# * subversion (version: 1.14.0-3)
# * hg-git (version: 0.9.0-2)
_mirror_url = "http://deb.debian.org/debian"
_suites = ["stretch", "buster", "bullseye"]
_components = ["main"]
SourcesText = str
def test_lister_debian(lister_debian, datadir, requests_mock_datadir):
"""Simple debian listing should create scheduled tasks
def _debian_sources_content(datadir: str, suite: Suite) -> SourcesText:
    """Read the text of the sample ``Sources_<suite>`` file found in ``datadir``."""
    sources_path = Path(datadir) / f"Sources_{suite}"
    return sources_path.read_text()
@pytest.fixture
def debian_sources(datadir: str) -> Dict[Suite, SourcesText]:
    """Map every suite of ``_suites`` to the text of its sample Sources file."""
    sources_by_suite: Dict[Suite, SourcesText] = {}
    for suite_name in _suites:
        sources_by_suite[suite_name] = _debian_sources_content(datadir, suite_name)
    return sources_by_suite
# suite -> package name -> list of versions
DebianSuitePkgSrcInfo = Dict[Suite, Dict[PkgName, List[Sources]]]
def _init_test(
    swh_scheduler: SchedulerInterface,
    debian_sources: Dict[Suite, SourcesText],
    requests_mock,
) -> Tuple[DebianLister, DebianSuitePkgSrcInfo]:
    """Create a lister for the given suites and mock its Sources index URLs.

    Args:
        swh_scheduler: scheduler instance handed to the lister
        debian_sources: Sources file text for each suite to list
        requests_mock: requests mocker used to register the index URLs

    Returns:
        The lister and, for each suite, the parsed source paragraphs grouped
        by package name.
    """
    lister = DebianLister(
        scheduler=swh_scheduler,
        mirror_url=_mirror_url,
        suites=list(debian_sources),
        components=_components,
    )

    suite_pkg_info: DebianSuitePkgSrcInfo = {}

    for suite, sources_text in debian_sources.items():
        # group the source paragraphs of the suite by package name
        paragraphs_by_package = defaultdict(list)
        suite_pkg_info[suite] = paragraphs_by_package
        for paragraph in Sources.iter_paragraphs(sources_text):
            paragraphs_by_package[paragraph["Package"]].append(paragraph)

        # URLs with a truthy compression value answer 404, the remaining ones
        # serve the sample Sources text
        for index_url, compression in lister.debian_index_urls(suite, _components[0]):
            response = {"status_code": 404} if compression else {"text": sources_text}
            requests_mock.get(index_url, **response)

    return lister, suite_pkg_info
def _check_listed_origins(
    swh_scheduler: SchedulerInterface,
    lister: DebianLister,
    suite_pkg_info: DebianSuitePkgSrcInfo,
    lister_previous_state: Dict[PkgName, Set[PkgVersion]],
) -> Set[DebianOrigin]:
    """Check the scheduler and lister state against the expected package versions.

    Every package version absent from ``lister_previous_state`` must have an
    origin recorded in the scheduler whose ``packages`` loader argument
    contains it, and must be present in the lister state.

    Returns:
        The origin URLs of the packages that had such a version.
    """
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    origin_urls = set()

    # walk suites, then packages, then the versions of each package
    for suite, pkg_info in suite_pkg_info.items():
        for package_name, pkg_srcs in pkg_info.items():
            for pkg_src in pkg_srcs:
                package_version_key = f"{suite}/{_components[0]}/{pkg_src['Version']}"

                # versions already known from the previous listing are not
                # expected to have been sent to the scheduler database again
                if (
                    package_name in lister_previous_state
                    and package_version_key in lister_previous_state[package_name]
                ):
                    continue

                origin_url = lister.origin_url_for_package(package_name)
                origin_urls.add(origin_url)

                # origins recorded in the scheduler database for that URL
                matching_origins = [
                    listed for listed in scheduler_origins if listed.url == origin_url
                ]
                assert matching_origins

                # the version info must be part of the loader arguments
                loader_packages = matching_origins[0].extra_loader_arguments["packages"]
                assert package_version_key in loader_packages

                # the version must also be recorded in the lister state
                assert package_name in lister.state.package_versions
                assert package_version_key in lister.state.package_versions[package_name]

    return origin_urls
def test_lister_debian_all_suites(
    swh_scheduler: SchedulerInterface,
    debian_sources: Dict[Suite, SourcesText],
    requests_mock,
):
    """
    Simulate a full listing of main component packages for all debian suites.

    The lister is run twice on the same mocked indexes: the first run must
    report one page per suite/component pair and as many origins as checked
    by ``_check_listed_origins``; the second run must report the same number
    of pages but no origin.
    """
    lister, suite_pkg_info = _init_test(swh_scheduler, debian_sources, requests_mock)

    stats = lister.run()

    origin_urls = _check_listed_origins(
        swh_scheduler, lister, suite_pkg_info, lister_previous_state={}
    )

    assert stats.pages == len(_suites) * len(_components)
    assert stats.origins == len(origin_urls)

    # second run on unchanged indexes: nothing new to list
    stats = lister.run()

    assert stats.pages == len(_suites) * len(_components)
    assert stats.origins == 0
@pytest.mark.parametrize(
    "suites_params",
    [
        [_suites[:1]],
        [_suites[:1], _suites[:2]],
        [_suites[:1], _suites[:2], _suites],
    ],
)
def test_lister_debian_updated_packages(
    swh_scheduler: SchedulerInterface,
    debian_sources: Dict[Suite, SourcesText],
    requests_mock,
    suites_params: List[List[Suite]],
):
    """
    Simulate incremental listing of main component packages by adding new suite
    to process between each listing operation.

    Args:
        swh_scheduler: scheduler instance shared by the successive listers
        debian_sources: Sources file text for each available suite
        requests_mock: requests mocker used to serve the Sources indexes
        suites_params: successive lists of suites to list, each one extending
            the previous one with an additional suite
    """
    lister_previous_state: Dict[PkgName, Set[PkgVersion]] = {}

    for suites in suites_params:
        sources = {suite: debian_sources[suite] for suite in suites}

        lister, suite_pkg_info = _init_test(swh_scheduler, sources, requests_mock)

        stats = lister.run()

        origin_urls = _check_listed_origins(
            swh_scheduler,
            lister,
            suite_pkg_info,
            lister_previous_state=lister_previous_state,
        )

        assert stats.pages == len(sources)
        assert stats.origins == len(origin_urls)

        lister_previous_state = lister.state.package_versions

        # only new packages or packages with new versions should be listed;
        # each step of every parametrization adds a suite compared to the
        # previous one, so at least one origin is expected each time
        assert stats.origins != 0

View file

@ -1,32 +0,0 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.lister.debian.models import Area, Distribution
def test_area_index_uris_deb(session):
    """An area of a ``deb`` distribution must yield at least one index URI."""
    distribution = Distribution(
        name="Debian", type="deb", mirror_uri="http://deb.debian.org/debian"
    )
    area = Area(distribution=distribution, name="unstable/main", active=True)
    session.add_all([distribution, area])
    session.commit()

    assert list(area.index_uris())
def test_area_index_uris_rpm(session):
    """Listing index URIs of an ``rpm`` distribution area must raise."""
    distribution = Distribution(
        name="CentOS", type="rpm", mirror_uri="http://centos.mirrors.proxad.net/"
    )
    area = Area(distribution=distribution, name="8", active=True)
    session.add_all([distribution, area])
    session.commit()

    with pytest.raises(NotImplementedError):
        # consume the iterable so that the error is actually triggered
        list(area.index_uris())

View file

@ -1,10 +1,12 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
res = swh_scheduler_celery_app.send_task("swh.lister.debian.tasks.ping")
@ -17,15 +19,25 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@patch("swh.lister.debian.tasks.DebianLister")
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check the debian lister celery task against a mocked ``DebianLister``.

    The task must build the lister through ``from_configfile`` with the task
    keyword arguments, run it once without argument, and return the listing
    stats as a dict.
    """
    # setup the mocked DebianLister
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=12, origins=35618)
    lister.run.return_value = stats

    kwargs = dict(
        mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
        distribution="Ubuntu",
        suites=["xenial", "bionic", "focal"],
        components=["main", "multiverse", "restricted", "universe"],
    )

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.debian.tasks.DebianListerTask", kwargs=kwargs
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()

    assert res.result == stats.dict()

View file

@ -1,83 +0,0 @@
# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import click
from swh.lister.debian.lister import DebianLister
from swh.lister.debian.models import Area, Distribution, SQLBase
@click.group()
@click.option("--verbose/--no-verbose", default=False)
@click.pass_context
def cli(ctx, verbose):
    # Command group entry point: stores a DebianLister in the click context
    # object and configures logging according to the verbosity flag.
    ctx.obj["lister"] = DebianLister()

    if verbose:
        # also raise the level of the sqlalchemy engine logger in verbose mode
        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

    logging.basicConfig(
        format="%(asctime)s %(process)d %(levelname)s %(message)s",
        level=logging.DEBUG if verbose else logging.INFO,
    )
@cli.command()
@click.pass_context
def create_schema(ctx):
    """Create the schema from the models"""
    # the engine comes from the lister stored in the context by the group
    db_engine = ctx.obj["lister"].db_engine
    SQLBase.metadata.create_all(db_engine)
@cli.command()
@click.option("--name", help="The name of the distribution")
@click.option("--type", help="The type of distribution")
@click.option("--mirror-uri", help="The URL to the mirror of the distribution")
@click.option("--area", help="The areas for the distribution", multiple=True)
@click.pass_context
def create_distribution(ctx, name, type, mirror_uri, area):
    # Register the distribution (looked up by name and type) and the requested
    # areas, creating only the objects that are not found in the database.
    db_session = ctx.obj["lister"].db_session
    pending = []

    distribution = (
        db_session.query(Distribution)
        .filter(Distribution.name == name)
        .filter(Distribution.type == type)
        .one_or_none()
    )
    if not distribution:
        distribution = Distribution(name=name, type=type, mirror_uri=mirror_uri)
        pending.append(distribution)

    for area_name in area:
        known_area = None
        # areas are only looked up when the distribution has an id
        if distribution.id:
            known_area = (
                db_session.query(Area)
                .filter(Area.distribution == distribution)
                .filter(Area.name == area_name)
                .one_or_none()
            )

        if not known_area:
            pending.append(Area(name=area_name, distribution=distribution))

    db_session.add_all(pending)
    db_session.commit()
@cli.command()
@click.option("--name", help="The name of the distribution")
@click.pass_context
def list_distribution(ctx, name):
    """List the distribution"""
    # run the lister stored in the context by the group for that distribution
    debian_lister = ctx.obj["lister"]
    debian_lister.run(name)
if __name__ == "__main__":
    # start the command group with an empty context object, which the group
    # callback then fills with the lister
    initial_context = {}
    cli(obj=initial_context)