cgit: reduce the batch size to 10 and add a bit of logging

Since the CGit lister now performs an HTTP query for each git repository listed
in the main index, it is significantly slower, so reducing the time between
database commits makes sense, and won't overload the database.

Adding a bit of logging makes it easier to follow and debug the progress of
a listing.
This commit is contained in:
David Douard 2019-09-04 11:04:19 +02:00
parent 8d9deeb8f8
commit bd11830328

View file

@ -3,6 +3,7 @@
# See top-level LICENSE file for more information
import re
import logging
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
@ -15,6 +16,9 @@ from swh.core.utils import grouper
from swh.lister.core.lister_base import ListerBase
logger = logging.getLogger(__name__)
class CGitLister(ListerBase):
"""Lister class for CGit repositories.
@ -70,12 +74,15 @@ class CGitLister(ListerBase):
self.session.mount(self.url, HTTPAdapter(max_retries=3))
def run(self):
for repos in grouper(self.get_repos(), 100):
total = 0
for repos in grouper(self.get_repos(), 10):
models = list(filter(None, (self.build_model(repo)
for repo in repos)))
injected_repos = self.inject_repo_data_into_db(models)
self.schedule_missing_tasks(models, injected_repos)
self.db_session.commit()
total += len(injected_repos)
logger.debug('Scheduled %s tasks for %s', total, self.url)
def get_repos(self):
"""Generate git 'project' URLs found on the current CGit server