simple_lister: Flush to db more frequently

This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-11-15 11:47:48 +01:00
parent bf030c0f00
commit 8d02458686
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8

View file

@ -11,6 +11,9 @@ from swh.core import utils
from .lister_base import ListerBase
logger = logging.getLogger(__name__)
class SimpleLister(ListerBase):
"""Lister* intermediate class for any service that follows the simple,
'list in oneshot information' pattern.
@ -21,6 +24,11 @@ class SimpleLister(ListerBase):
information and stores those in db
"""
flush_packet_db = 2
"""Number of iterations in-between write flushes of lister repositories to
db (see fn:`ingest_data`).
"""
def list_packages(self, response: Any) -> List[Any]:
"""Listing packages method.
@ -47,7 +55,7 @@ class SimpleLister(ListerBase):
models_list = self.transport_response_simplified(response)
models_list = self.filter_before_inject(models_list)
all_injected = []
for models in utils.grouper(models_list, n=1000):
for i, models in enumerate(utils.grouper(models_list, n=100), start=1):
models = list(models)
logging.debug('models: %s' % len(models))
# inject into local db
@ -55,9 +63,10 @@ class SimpleLister(ListerBase):
# queue workers
self.schedule_missing_tasks(models, injected)
all_injected.append(injected)
# flush
self.db_session.commit()
self.db_session = self.mk_session()
if (i % self.flush_packet_db) == 0:
logger.debug('Flushing updates at index %s', i)
self.db_session.commit()
self.db_session = self.mk_session()
return response, all_injected