From 0ad37740d9d7cfa4a7d75f5c8d5d7568396c1abf Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Thu, 28 Jan 2021 16:51:56 +0100 Subject: [PATCH] pattern: Make lister flush regularly origins to scheduler As origins is a generator, the previous behavior would try to consume the overall generator to send the records. This groups and sends batch of 100 origins to the scheduler for writing. Related to T3003 --- swh/lister/pattern.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 3c491f7..4f541fb 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar from swh.core.config import load_from_envvar +from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface @@ -221,8 +222,12 @@ class Lister(Generic[StateType, PageType]): Returns: the number of listed origins recorded in the scheduler """ - ret = self.scheduler.record_listed_origins(origins) - return len(ret) + count = 0 + for batch_origins in grouper(origins, n=100): + ret = self.scheduler.record_listed_origins(batch_origins) + count += len(ret) + + return count @classmethod def from_config(cls, scheduler: Dict[str, Any], **config: Any):