cran: Prevent multiple listing of an origin
A CRAN package can appear twice in the JSON list returned by the list_all_packages.R script, with the most recent version of the package appearing first. Handle that edge case to avoid an error when sending origins to the scheduler.
This commit is contained in:
parent
b4c4c20bb9
commit
1803b707e4
2 changed files with 25 additions and 0 deletions
|
@ -45,9 +45,18 @@ class CRANLister(StatelessLister[PageType]):
|
|||
|
||||
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
seen_urls = set()
|
||||
for package_info in page:
|
||||
origin_url, artifact_url = compute_origin_urls(package_info)
|
||||
|
||||
if origin_url in seen_urls:
|
||||
# prevent multiple listing of an origin,
|
||||
# most recent version will be listed first
|
||||
continue
|
||||
|
||||
seen_urls.add(origin_url)
|
||||
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=origin_url,
|
||||
|
|
|
@ -105,6 +105,22 @@ def test_cran_lister_cran(datadir, swh_scheduler, mocker):
|
|||
filtered_origins[0].last_update == parse_packaged_date(package_info)
|
||||
|
||||
|
||||
def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
    """Run the lister on a page in which every package entry occurs twice.

    The fixture list is concatenated with itself and served through the
    patched ``read_cran_data``; the run must report a single page and no
    more origins than there are entries in the fixture.
    """
    fixture_path = path.join(datadir, "list-r-packages.json")
    with open(fixture_path) as fixture:
        packages = json.load(fixture)

    cran_lister = CRANLister(swh_scheduler)

    # Serve the fixture twice over so that each entry shows up two times.
    read_cran_data = mocker.patch("swh.lister.cran.lister.read_cran_data")
    read_cran_data.return_value = packages + packages

    listing_stats = cran_lister.run()

    assert listing_stats.pages == 1
    assert listing_stats.origins == len(packages)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"credentials, expected_credentials",
|
||||
[
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue