packagist: Yield pages of origins to regularly record origins

Instead of sending one page with all origins listed, which is brittle.
When something goes wrong during the listing, the lister currently records nothing.
This commit is contained in:
Antoine R. Dumont (@ardumont) 2023-08-03 16:12:13 +02:00
parent 15a4c4cdb4
commit b02144b4f9
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
2 changed files with 5 additions and 4 deletions

View file

@ -13,6 +13,7 @@ import iso8601
import requests
from tenacity import RetryError
from swh.core.utils import grouper
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -126,12 +127,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
return {}
def get_pages(self) -> Iterator[PackagistPageType]:
"""
Yield a single page listing all Packagist projects (randomly).
"""
"""Retrieve & randomize unique list of packages into pages of packages."""
package_names = self.api_request(self.url)["packageNames"]
shuffle(package_names)
yield package_names
for page_packages in grouper(package_names, n=self.record_batch_size):
yield page_packages
def _get_metadata_from_page(
self, package_url_format: str, package_name: str

View file

@ -356,6 +356,7 @@ class Lister(Generic[StateType, PageType]):
else:
logger.warning("Skipping invalid origin: %s", origin.url)
logger.debug("Record valid %s origins in the scheduler", len(valid_origins))
ret = self.scheduler.record_listed_origins(valid_origins)
recorded_origins.extend(origin.url for origin in ret)