cgit: reduce the batch size to 10 and add a bit of logging
Since the CGit lister now performs an HTTP query for each git repository listed in the main index, it is significantly slower, so reducing the time between database commits makes sense and will not overload the database. With a bit of logging, it is also easier to follow and debug the progress of a listing.
parent 8d9deeb8f8
commit bd11830328

1 changed file with 8 additions and 1 deletion
@@ -3,6 +3,7 @@
 # See top-level LICENSE file for more information
 
 import re
+import logging
 from urllib.parse import urlparse, urljoin
 
 from bs4 import BeautifulSoup
@@ -15,6 +16,9 @@ from swh.core.utils import grouper
 from swh.lister.core.lister_base import ListerBase
 
 
+logger = logging.getLogger(__name__)
+
+
 class CGitLister(ListerBase):
     """Lister class for CGit repositories.
 
@@ -70,12 +74,15 @@ class CGitLister(ListerBase):
         self.session.mount(self.url, HTTPAdapter(max_retries=3))
 
     def run(self):
-        for repos in grouper(self.get_repos(), 100):
+        total = 0
+        for repos in grouper(self.get_repos(), 10):
             models = list(filter(None, (self.build_model(repo)
                                         for repo in repos)))
             injected_repos = self.inject_repo_data_into_db(models)
             self.schedule_missing_tasks(models, injected_repos)
             self.db_session.commit()
+            total += len(injected_repos)
+            logger.debug('Scheduled %s tasks for %s', total, self.url)
 
     def get_repos(self):
         """Generate git 'project' URLs found on the current CGit server
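For context, below is a minimal, self-contained sketch of the batch-and-commit pattern that run() follows. The chunks() helper, the commit callback, and the example.org URLs are illustrative stand-ins, not the actual swh.core.utils.grouper or the lister's db_session: each batch is persisted on its own, so a smaller batch size shortens the work done between two commits, and the running total logged at debug level makes progress visible.

# Sketch only: a hypothetical equivalent of the batching done in run() above.
import logging
from itertools import islice

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def chunks(iterable, n):
    """Yield successive lists of at most n items from an iterable
    (stand-in for swh.core.utils.grouper)."""
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch


def list_repos(repo_urls, commit, batch_size=10):
    """Process repository URLs in small batches, committing after each one.

    A smaller batch_size means less (slow, one-HTTP-query-per-repo) work
    between two commits, and the running total shows progress in the log.
    """
    total = 0
    for batch in chunks(repo_urls, batch_size):
        commit(batch)                 # persist this batch only
        total += len(batch)
        logger.debug('Scheduled %s tasks so far', total)
    return total


if __name__ == '__main__':
    stored = []
    list_repos((f'https://example.org/repo{i}.git' for i in range(25)),
               commit=stored.extend)
    print(len(stored))  # 25

With batch_size=10 this sketch commits three times for 25 repositories rather than once per 100-item batch; the real lister applies the same pattern to its database session.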