Puppet: Lister implements incremental mode

Use the with_release_since API argument to retrieve only modules that have been
updated since the last time the lister was executed.

Related T4519
Franck Bret 2022-10-26 10:09:03 +02:00
parent e8699422d7
commit e1f3f87c73
4 changed files with 382 additions and 6 deletions
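
For context, a minimal sketch of the kind of query this commit enables against the Puppet Forge API. It assumes the public https://forgeapi.puppet.com endpoint and a plain requests call; the lister itself goes through its own http_request helper, and the date value below is an arbitrary example, not real lister state.

    import requests

    # Hypothetical standalone query showing the with_release_since filter
    response = requests.get(
        "https://forgeapi.puppet.com/v3/modules",
        params={"limit": 100, "with_release_since": "2022-09-26"},
    )
    modules = response.json()["results"]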

@@ -22,6 +22,10 @@ It returns a paginated list of results and a `next` URL.
The API follows the `OpenAPI 3.0 specification`.
The lister is incremental, using the ``with_release_since`` API argument, whose value
is an ISO date set from the last time the lister was executed, stored as
``lister.state.last_listing_date``.
Page listing
------------
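
As a rough illustration of the paginated listing described in this documentation excerpt (a sketch, assuming the public forgeapi.puppet.com endpoint and that the response exposes the next page as a relative URL under pagination.next):

    import requests
    from urllib.parse import urljoin

    base_url = "https://forgeapi.puppet.com/"
    data = requests.get(urljoin(base_url, "v3/modules"), params={"limit": 100}).json()
    while data.get("pagination", {}).get("next"):
        # Follow the `next` relative URL until the last page is reached
        data = requests.get(urljoin(base_url, data["pagination"]["next"])).json()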

@@ -3,15 +3,18 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@@ -19,7 +22,15 @@ logger = logging.getLogger(__name__)
PuppetListerPage = List[Dict[str, Any]]
class PuppetLister(StatelessLister[PuppetListerPage]):
@dataclass
class PuppetListerState:
"""Store lister state for incremental mode operations"""
last_listing_date: Optional[datetime] = None
"""Last date when Puppet lister was executed"""
class PuppetLister(Lister[PuppetListerState, PuppetListerPage]):
"""The Puppet lister list origins from 'Puppet Forge'"""
LISTER_NAME = "puppet"
@@ -39,6 +50,21 @@ class PuppetLister(StatelessLister[PuppetListerPage]):
instance=self.INSTANCE,
url=self.BASE_URL,
)
# Store the datetime at which the lister runs, for incremental mode purposes
self.listing_date = datetime.now()
def state_from_dict(self, d: Dict[str, Any]) -> PuppetListerState:
last_listing_date = d.get("last_listing_date")
if last_listing_date is not None:
d["last_listing_date"] = iso8601.parse_date(last_listing_date)
return PuppetListerState(**d)
def state_to_dict(self, state: PuppetListerState) -> Dict[str, Any]:
d: Dict[str, Optional[str]] = {"last_listing_date": None}
last_listing_date = state.last_listing_date
if last_listing_date is not None:
d["last_listing_date"] = last_listing_date.isoformat()
return d
def get_pages(self) -> Iterator[PuppetListerPage]:
"""Yield an iterator which returns 'page'
@@ -52,9 +78,21 @@ class PuppetLister(StatelessLister[PuppetListerPage]):
"""
# limit = 100 is the max value for pagination
limit: int = 100
response = self.http_request(
f"{self.BASE_URL}v3/modules", params={"limit": limit}
)
params: Dict[str, Any] = {"limit": limit}
if self.state.last_listing_date:
# Incremental mode filter query
# To ensure we don't miss records between two lister runs, `last_str` must be
# set with an offset of -15 hours, which is the lowest timezone offset recorded
# in the tzdb
last_str = (
self.state.last_listing_date.astimezone(timezone(timedelta(hours=-15)))
.date()
.isoformat()
)
params["with_release_since"] = last_str
response = self.http_request(f"{self.BASE_URL}v3/modules", params=params)
data: Dict[str, Any] = response.json()
yield data["results"]
@@ -111,3 +149,7 @@ class PuppetLister(StatelessLister[PuppetListerPage]):
last_update=last_update,
extra_loader_arguments={"artifacts": artifacts},
)
def finalize(self) -> None:
self.state.last_listing_date = self.listing_date
self.updated = True

File diff suppressed because one or more lines are too long

@@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
from swh.lister.puppet.lister import PuppetLister
# flake8: noqa: B950
@@ -118,3 +120,45 @@ def test_puppet_lister(datadir, requests_mock_datadir, swh_scheduler):
)
for expected in sorted(expected_origins, key=lambda expected: expected["url"])
]
def test_puppet_lister_incremental(datadir, requests_mock_datadir, swh_scheduler):
# First run
lister = PuppetLister(scheduler=swh_scheduler)
res = lister.run()
assert res.pages == 2
assert res.origins == 1 + 1 + 1
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins)
assert [
(
scheduled.visit_type,
scheduled.url,
scheduled.extra_loader_arguments["artifacts"],
)
for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url)
] == [
(
"puppet",
expected["url"],
expected["artifacts"],
)
for expected in sorted(expected_origins, key=lambda expected: expected["url"])
]
# Second run
lister = PuppetLister(scheduler=swh_scheduler)
# Force lister.state.last_listing_date for correct fixture loading
lister.state.last_listing_date = datetime(2022, 9, 26, 18, 0).astimezone(
timezone(timedelta(hours=-7))
)
res = lister.run()
assert res.pages == 1
assert res.origins == 1