From c6aa490fc11249da3b6224579a84f9309f4486e1 Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Wed, 21 Aug 2024 16:12:37 +0200 Subject: [PATCH] crates: Record lister state only if all crates were processed Previously, the lister state was recorded regardless of whether errors occurred when listing crates, because the finalize method is called even when an exception is raised during listing. As a consequence, some crates could be missed, as the incremental listing restarts from the dump date of the last processed crate database. So ensure all crates have been processed by the lister before recording its state. --- swh/lister/crates/lister.py | 7 ++++--- swh/lister/crates/tests/test_lister.py | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index e31756c..0e41be7 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -82,6 +82,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): enable_origins=enable_origins, ) self.index_metadata: Dict[str, str] = {} + self.all_crates_processed = False def state_from_dict(self, d: Dict[str, Any]) -> CratesListerState: index_last_update = d.get("index_last_update") @@ -210,6 +211,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): page.append(self.page_entry_dict(v)) yield page + self.all_crates_processed = True def get_origins_from_page(self, page: CratesListerPage) -> Iterator[ListedOrigin]: """Iterate on all crate pages and yield ListedOrigin instances.""" @@ -255,8 +257,7 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): ) def finalize(self) -> None: - last: datetime = iso8601.parse_date(self.index_metadata["timestamp"]) - - if not self.state.index_last_update: + if not self.state.index_last_update and self.all_crates_processed: + last = iso8601.parse_date(self.index_metadata["timestamp"]) self.state.index_last_update = last self.updated = True diff --git 
a/swh/lister/crates/tests/test_lister.py b/swh/lister/crates/tests/test_lister.py index ebc9220..82a41a6 100644 --- a/swh/lister/crates/tests/test_lister.py +++ b/swh/lister/crates/tests/test_lister.py @@ -1,8 +1,10 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import iso8601 +import pytest from swh.lister.crates.lister import CratesLister, CratesListerState @@ -250,3 +252,16 @@ def test_crates_lister_incremental_nothing_new( assert res.pages == 0 assert res.origins == 0 + + +def test_crates_lister_error_when_processing_crate( + swh_scheduler, requests_mock_datadir, mocker +): + """Lister state should not be recorded to scheduler if an error occurred + when processing crate data.""" + lister = CratesLister(scheduler=swh_scheduler) + mocker.patch.object(lister, "page_entry_dict").side_effect = IndexError() + with pytest.raises(IndexError): + lister.run() + + assert lister.get_state_from_scheduler().index_last_update is None