conda: Yield listed origins after all artifacts in a page are processed
swh-scheduler deduplicates listed origins by URL and visit type, not by their extra loader arguments. Previously, a listed origin was yielded after each processed artifact in a page, so some package version info could be lost to that deduplication. Ensure listed origins are yielded only once all artifacts in a page have been processed.
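The rationale can be illustrated with a small standalone sketch (hypothetical names and data, not the lister's actual API): yielding one origin per artifact produces several records for the same URL, of which the scheduler keeps only one, while yielding once per package after the whole page is processed keeps the full artifact list.

# Standalone sketch (hypothetical names and data, not the lister's actual API)
# contrasting per-artifact and per-package yielding for one page of artifacts.
from collections import defaultdict
from typing import Dict, Iterator, List, Tuple

Origin = Tuple[str, List[str]]  # (origin URL, known versions)

def yield_per_artifact(page: List[Tuple[str, str]]) -> Iterator[Origin]:
    versions: Dict[str, List[str]] = defaultdict(list)
    for name, version in page:
        versions[name].append(version)
        # One origin per artifact: the scheduler later keeps a single record
        # per URL, so earlier (shorter) version lists are silently dropped.
        yield (f"https://anaconda.org/conda-forge/{name}", list(versions[name]))

def yield_per_package(page: List[Tuple[str, str]]) -> Iterator[Origin]:
    versions: Dict[str, List[str]] = defaultdict(list)
    for name, version in page:
        versions[name].append(version)
    # One origin per package, emitted only after the whole page is processed,
    # so the single record carries every version seen in the page.
    for name, known_versions in versions.items():
        yield (f"https://anaconda.org/conda-forge/{name}", known_versions)

page = [("21cmfast", "3.0.2-py37"), ("21cmfast", "3.1.1-py38")]
print(len(list(yield_per_artifact(page))))  # 2 records for the same URL
print(len(list(yield_per_package(page))))   # 1 record with both versions

Running the sketch shows 2 records for the per-artifact strategy and 1 for the per-package one, which is why the lister now buffers package names per page.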
parent 31eb5f637f
commit 4f6b3f3f09
2 changed files with 59 additions and 34 deletions
swh/lister/conda/lister.py

@@ -71,7 +71,9 @@ class CondaLister(StatelessLister[CondaListerPage]):
         assert self.lister_obj.id is not None
         arch, packages = page

+        package_names = set()
         for filename, package_metadata in packages.items():
+            package_names.add(package_metadata["name"])
             version_key = (
                 f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
             )
@@ -102,22 +104,20 @@ class CondaLister(StatelessLister[CondaListerPage]):
             elif "date" in package_metadata:
                 package_date = iso8601.parse_date(package_metadata["date"])

-            last_update = None
             if package_date:
                 artifact["date"] = package_date.isoformat()
                 self.package_dates[package_metadata["name"]].append(package_date)
-                last_update = max(self.package_dates[package_metadata["name"]])

+        for package_name in package_names:
+            package_dates = self.package_dates[package_name]
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 visit_type=self.VISIT_TYPE,
                 url=self.ORIGIN_URL_PATTERN.format(
-                    channel=self.channel, pkgname=package_metadata["name"]
+                    channel=self.channel, pkgname=package_name
                 ),
-                last_update=last_update,
+                last_update=max(package_dates, default=None),
                 extra_loader_arguments={
-                    "artifacts": [
-                        v for k, v in self.packages[package_metadata["name"]].items()
-                    ],
+                    "artifacts": list(self.packages[package_name].values())
                 },
             )
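Note, not part of the diff: max() with default=None keeps last_update optional when a package has no parsed dates, which is what the removed last_update = None initialisation used to guarantee. A minimal sketch of that behaviour (the sample dates are made up):

# Standalone illustration of how last_update is derived from accumulated
# per-package dates; the data below is made up for the example.
from datetime import datetime, timezone

package_dates = {
    "21cmfast": [
        datetime(2020, 10, 14, tzinfo=timezone.utc),
        datetime(2021, 3, 2, tzinfo=timezone.utc),
    ],
    "undated-package": [],  # hypothetical package whose artifacts carry no date
}

for name, dates in package_dates.items():
    # max(..., default=None) picks the most recent date, or None when the
    # package has no dated artifacts, so ListedOrigin.last_update stays optional.
    last_update = max(dates, default=None)
    print(name, last_update)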
swh/lister/conda/tests/test_lister.py

@@ -3,36 +3,14 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import pytest
+
 from swh.lister.conda.lister import CondaLister


-def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
-    lister = CondaLister(
-        scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
-    )
-    res = lister.run()
-
-    assert res.pages == 3
-    assert res.origins == 11
-
-
-def test_conda_lister_conda_forge_channel(
-    datadir, requests_mock_datadir, swh_scheduler
-):
-    lister = CondaLister(
-        scheduler=swh_scheduler,
-        url="https://conda.anaconda.org",
-        channel="conda-forge",
-        archs=["linux-64"],
-    )
-    res = lister.run()
-
-    assert res.pages == 1
-    assert res.origins == 2
-
-    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
-
-    expected_origins = [
+@pytest.fixture
+def expected_origins():
+    return [
         {
             "url": "https://anaconda.org/conda-forge/21cmfast",
             "artifacts": [
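Note, not part of the diff: the expected_origins fixture introduced above is injected into the tests below by parameter name, which is standard pytest behaviour. A minimal sketch of the pattern, with hypothetical names:

# Minimal sketch of the fixture pattern (hypothetical fixture and test names):
# pytest passes the fixture's return value to any test declaring a parameter
# with the same name, so several tests can share expected data.
import pytest

@pytest.fixture
def sample_origins():
    return [{"url": "https://example.org/pkg"}]

def test_uses_fixture(sample_origins):
    assert sample_origins[0]["url"].startswith("https://")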
@@ -75,6 +53,33 @@ def test_conda_lister_conda_forge_channel(
         },
     ]

+
+def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
+    lister = CondaLister(
+        scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
+    )
+    res = lister.run()
+
+    assert res.pages == 3
+    assert res.origins == 11
+
+
+def test_conda_lister_conda_forge_channel(
+    requests_mock_datadir, swh_scheduler, expected_origins
+):
+    lister = CondaLister(
+        scheduler=swh_scheduler,
+        url="https://conda.anaconda.org",
+        channel="conda-forge",
+        archs=["linux-64"],
+    )
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == 2
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
     assert len(scheduler_origins) == len(expected_origins)

     assert [
@@ -92,3 +97,23 @@ def test_conda_lister_conda_forge_channel(
         )
         for expected in sorted(expected_origins, key=lambda expected: expected["url"])
     ]
+
+
+def test_conda_lister_number_of_yielded_origins(
+    requests_mock_datadir, swh_scheduler, expected_origins
+):
+    """Check that a single ListedOrigin instance is sent by expected origins."""
+    lister = CondaLister(
+        scheduler=swh_scheduler,
+        url="https://conda.anaconda.org",
+        channel="conda-forge",
+        archs=["linux-64"],
+    )
+
+    listed_origins = []
+    for page in lister.get_pages():
+        listed_origins += list(lister.get_origins_from_page(page))
+
+    assert sorted([listed_origin.url for listed_origin in listed_origins]) == sorted(
+        [origin["url"] for origin in expected_origins]
+    )
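A possible complement to the new test, not part of the commit: counting yielded URLs makes the uniqueness expectation explicit. Sketch with a hypothetical helper:

# Hypothetical extra assertion (not in the commit): verify that no origin URL
# is yielded more than once across the listed origins collected above.
from collections import Counter

def assert_unique_urls(listed_origins):
    counts = Counter(origin.url for origin in listed_origins)
    duplicated = {url: n for url, n in counts.items() if n > 1}
    assert not duplicated, f"origins yielded several times: {duplicated}"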