cran: Prevent multiple listing of an origin

A CRAN package can appear twice in the JSON list returned by the
list_all_packages.R script, with the most recent version of the
package appearing first.

So handle that edge case to avoid an error when sending origins to
the scheduler.
This commit is contained in:
Antoine Lambert 2021-02-05 12:51:20 +01:00
parent b4c4c20bb9
commit 1803b707e4
2 changed files with 25 additions and 0 deletions

View file

@ -45,9 +45,18 @@ class CRANLister(StatelessLister[PageType]):
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
assert self.lister_obj.id is not None
seen_urls = set()
for package_info in page:
origin_url, artifact_url = compute_origin_urls(package_info)
if origin_url in seen_urls:
# prevent multiple listing of an origin,
# most recent version will be listed first
continue
seen_urls.add(origin_url)
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,

View file

@ -105,6 +105,22 @@ def test_cran_lister_cran(datadir, swh_scheduler, mocker):
filtered_origins[0].last_update == parse_packaged_date(package_info)
def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
    """Check that a package list containing duplicates is listed only once.

    The CRAN data returned to the lister is the fixture list concatenated
    with itself; the run must still report a single page and exactly as
    many origins as the fixture list has entries.
    """
    fixture_path = path.join(datadir, "list-r-packages.json")
    with open(fixture_path) as fixture:
        packages = json.load(fixture)

    cran_lister = CRANLister(swh_scheduler)

    # Every package shows up twice in the data handed to the lister.
    read_cran_data = mocker.patch("swh.lister.cran.lister.read_cran_data")
    read_cran_data.return_value = packages + packages

    run_stats = cran_lister.run()

    assert run_stats.pages == 1
    assert run_stats.origins == len(packages)
@pytest.mark.parametrize(
"credentials, expected_credentials",
[