From bd11830328591339dac08f15eac90f010c7747b8 Mon Sep 17 00:00:00 2001 From: David Douard Date: Wed, 4 Sep 2019 11:04:19 +0200 Subject: [PATCH] cgit: reduce the batch size to 10 and add a bit of logging Since the CGit lister now performs an HTTP query for each git repository listed in the main index, it is significantly slower, so reducing the time between database commits makes sense, and won't overload the database. With a bit of logging, it makes it easier to follow/debug the progress of a listing. --- swh/lister/cgit/lister.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 3b8b091..c459eb5 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -3,6 +3,7 @@ # See top-level LICENSE file for more information import re +import logging from urllib.parse import urlparse, urljoin from bs4 import BeautifulSoup @@ -15,6 +16,9 @@ from swh.core.utils import grouper from swh.lister.core.lister_base import ListerBase +logger = logging.getLogger(__name__) + + class CGitLister(ListerBase): """Lister class for CGit repositories. @@ -70,12 +74,15 @@ class CGitLister(ListerBase): self.session.mount(self.url, HTTPAdapter(max_retries=3)) def run(self): - for repos in grouper(self.get_repos(), 100): + total = 0 + for repos in grouper(self.get_repos(), 10): models = list(filter(None, (self.build_model(repo) for repo in repos))) injected_repos = self.inject_repo_data_into_db(models) self.schedule_missing_tasks(models, injected_repos) self.db_session.commit() + total += len(injected_repos) + logger.debug('Scheduled %s tasks for %s', total, self.url) def get_repos(self): """Generate git 'project' URLs found on the current CGit server