packagist: Yield pages of origins to regularly record origins
Instead of sending one page with all origins listed, which is brittle: when something goes wrong during the listing, the lister currently records nothing.
This commit is contained in:
parent
15a4c4cdb4
commit
b02144b4f9
2 changed files with 5 additions and 4 deletions
|
@ -13,6 +13,7 @@ import iso8601
|
|||
import requests
|
||||
from tenacity import RetryError
|
||||
|
||||
from swh.core.utils import grouper
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -126,12 +127,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
|
|||
return {}
|
||||
|
||||
def get_pages(self) -> Iterator[PackagistPageType]:
|
||||
"""
|
||||
Yield a single page listing all Packagist projects (randomly).
|
||||
"""
|
||||
"""Retrieve & randomize unique list of packages into pages of packages."""
|
||||
package_names = self.api_request(self.url)["packageNames"]
|
||||
shuffle(package_names)
|
||||
yield package_names
|
||||
for page_packages in grouper(package_names, n=self.record_batch_size):
|
||||
yield page_packages
|
||||
|
||||
def _get_metadata_from_page(
|
||||
self, package_url_format: str, package_name: str
|
||||
|
|
|
@ -356,6 +356,7 @@ class Lister(Generic[StateType, PageType]):
|
|||
else:
|
||||
logger.warning("Skipping invalid origin: %s", origin.url)
|
||||
|
||||
logger.debug("Record valid %s origins in the scheduler", len(valid_origins))
|
||||
ret = self.scheduler.record_listed_origins(valid_origins)
|
||||
recorded_origins.extend(origin.url for origin in ret)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue