conda: Yield listed origins after all artifacts in a page are processed

swh-scheduler will deduplicate listed origins according to their URL
and visit type but not according to their extra loader arguments.

Previously, a listed origin was yielded after each processed artifact
in a page, so some package version info could be lost when the
scheduler deduplicated those origins.

So make sure listed origins are yielded only once all artifacts in a
page have been processed.
This commit is contained in:
Antoine Lambert 2022-10-19 15:59:25 +02:00
parent 31eb5f637f
commit 4f6b3f3f09
2 changed files with 59 additions and 34 deletions

View file

@ -71,7 +71,9 @@ class CondaLister(StatelessLister[CondaListerPage]):
assert self.lister_obj.id is not None
arch, packages = page
package_names = set()
for filename, package_metadata in packages.items():
package_names.add(package_metadata["name"])
version_key = (
f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
)
@ -102,22 +104,20 @@ class CondaLister(StatelessLister[CondaListerPage]):
elif "date" in package_metadata:
package_date = iso8601.parse_date(package_metadata["date"])
last_update = None
if package_date:
artifact["date"] = package_date.isoformat()
self.package_dates[package_metadata["name"]].append(package_date)
last_update = max(self.package_dates[package_metadata["name"]])
for package_name in package_names:
package_dates = self.package_dates[package_name]
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=self.ORIGIN_URL_PATTERN.format(
channel=self.channel, pkgname=package_metadata["name"]
channel=self.channel, pkgname=package_name
),
last_update=last_update,
last_update=max(package_dates, default=None),
extra_loader_arguments={
"artifacts": [
v for k, v in self.packages[package_metadata["name"]].items()
],
"artifacts": list(self.packages[package_name].values())
},
)

View file

@ -3,36 +3,14 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.lister.conda.lister import CondaLister
def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
lister = CondaLister(
scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
)
res = lister.run()
assert res.pages == 3
assert res.origins == 11
def test_conda_lister_conda_forge_channel(
datadir, requests_mock_datadir, swh_scheduler
):
lister = CondaLister(
scheduler=swh_scheduler,
url="https://conda.anaconda.org",
channel="conda-forge",
archs=["linux-64"],
)
res = lister.run()
assert res.pages == 1
assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
expected_origins = [
@pytest.fixture
def expected_origins():
return [
{
"url": "https://anaconda.org/conda-forge/21cmfast",
"artifacts": [
@ -75,6 +53,33 @@ def test_conda_lister_conda_forge_channel(
},
]
def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister on the "free" channel and check the listing stats."""
    archs = ["linux-64", "osx-64", "win-64"]
    lister = CondaLister(scheduler=swh_scheduler, channel="free", archs=archs)

    stats = lister.run()

    # The three requested architectures are expected to give three pages.
    assert stats.pages == 3
    assert stats.origins == 11
def test_conda_lister_conda_forge_channel(
requests_mock_datadir, swh_scheduler, expected_origins
):
lister = CondaLister(
scheduler=swh_scheduler,
url="https://conda.anaconda.org",
channel="conda-forge",
archs=["linux-64"],
)
res = lister.run()
assert res.pages == 1
assert res.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins)
assert [
@ -92,3 +97,23 @@ def test_conda_lister_conda_forge_channel(
)
for expected in sorted(expected_origins, key=lambda expected: expected["url"])
]
def test_conda_lister_number_of_yielded_origins(
    requests_mock_datadir, swh_scheduler, expected_origins
):
    """Check that exactly one ListedOrigin is yielded per expected origin."""
    lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )

    # Drive the lister by hand (no scheduler round-trip) so that duplicated
    # origins would show up as extra URLs in the collected list.
    yielded_urls = [
        listed_origin.url
        for page in lister.get_pages()
        for listed_origin in lister.get_origins_from_page(page)
    ]
    expected_urls = [origin["url"] for origin in expected_origins]

    assert sorted(yielded_urls) == sorted(expected_urls)