swh.lister.gitlab: Add the presence check for the incremental lister

This commit is contained in:
Antoine R. Dumont (@ardumont) 2018-07-11 19:15:54 +02:00
parent d520891547
commit b9544c77f4
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
2 changed files with 40 additions and 14 deletions

View file

@ -150,6 +150,25 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
"""
return models_list
def do_additional_checks(self, models_list):
"""Execute some additional checks on the model list. For example, to
check for existing repositories in the db.
MAY BE OVERRIDDEN if an intermediate Lister class needs to
check some more the results before injection.
Checks are fine by default, returns the models_list as is by default.
Args:
models_list: list of dicts returned by
transport_response_simplified.
Returns:
models_list with entries if checks ok, False otherwise
"""
return models_list
def is_within_bounds(self, inner, lower=None, upper=None):
"""See if a sortable value is inside the range [lower,upper].
@ -453,18 +472,23 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
[self.task_dict(m['origin_type'], m['origin_url'])]
)[0]['id']
def ingest_data(self, identifier):
def ingest_data(self, identifier, checks=False):
"""The core data fetch sequence. Request server endpoint. Simplify and
filter response list of repositories. Inject repo information into
local db. Queue loader tasks for linked repositories.
Args:
identifier: Resource identifier.
checks (bool): Additional checks required
"""
# Request (partial?) list of repositories info
response = self.safely_issue_request(identifier)
models_list = self.transport_response_simplified(response)
models_list = self.filter_before_inject(models_list)
if checks:
models_list = self.do_additional_checks(models_list)
if not models_list:
return response, []
# inject into local db
injected = self.inject_repo_data_into_db(models_list)
# queue workers

View file

@ -79,14 +79,18 @@ class PageByPageLister(SWHListerBase):
# You probably don't need to override anything below this line.
def check_existence(self, injected_repos):
"""Given a list of injected repos, check if we already have them.
def do_additional_checks(self, models_list):
"""Potentially check for existence of repositories in models_list.
Attribute 'instance' variable is assumed to be populated.
This will be called only if check_existence is flipped on in
the run method below.
"""
# FIXME: Implement the check
return False
for m in models_list:
sql_repo = self.db_query_equal('uid', m['uid'])
if sql_repo:
return False
return models_list
def run(self, min_bound=None, max_bound=None, check_existence=False):
"""Main entry function. Sequentially fetches repository data from the
@ -111,16 +115,17 @@ class PageByPageLister(SWHListerBase):
self.min_page = min_bound
self.max_page = max_bound
already_seen = False
while self.is_within_bounds(page, self.min_page, self.max_page):
logging.info('listing repos starting at %s' % page)
response, injected_repos = self.ingest_data(page)
next_page = self.get_next_target_from_response(response)
response, injected_repos = self.ingest_data(page,
checks=check_existence)
if not injected_repos:
logging.info('Repositories already seen, stopping')
break
if check_existence:
already_seen = self.check_existence(injected_repos)
next_page = self.get_next_target_from_response(response)
# termination condition
@ -128,9 +133,6 @@ class PageByPageLister(SWHListerBase):
logging.info('stopping after page %s, no next link found' %
page)
break
elif already_seen:
logging.info('Repositories already seen, stopping')
break
else:
page = next_page