swh.lister.gitlab: Add the presence check for the incremental lister
This commit is contained in:
parent
d520891547
commit
b9544c77f4
2 changed files with 40 additions and 14 deletions
|
@ -150,6 +150,25 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
|
|||
"""
|
||||
return models_list
|
||||
|
||||
def do_additional_checks(self, models_list):
|
||||
"""Execute some additional checks on the model list. For example, to
|
||||
check for existing repositories in the db.
|
||||
|
||||
MAY BE OVERRIDDEN if an intermediate Lister class needs to
|
||||
check some more the results before injection.
|
||||
|
||||
Checks are fine by default, returns the models_list as is by default.
|
||||
|
||||
Args:
|
||||
models_list: list of dicts returned by
|
||||
transport_response_simplified.
|
||||
|
||||
Returns:
|
||||
models_list with entries if checks ok, False otherwise
|
||||
|
||||
"""
|
||||
return models_list
|
||||
|
||||
def is_within_bounds(self, inner, lower=None, upper=None):
|
||||
"""See if a sortable value is inside the range [lower,upper].
|
||||
|
||||
|
@ -453,18 +472,23 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
|
|||
[self.task_dict(m['origin_type'], m['origin_url'])]
|
||||
)[0]['id']
|
||||
|
||||
def ingest_data(self, identifier):
|
||||
def ingest_data(self, identifier, checks=False):
|
||||
"""The core data fetch sequence. Request server endpoint. Simplify and
|
||||
filter response list of repositories. Inject repo information into
|
||||
local db. Queue loader tasks for linked repositories.
|
||||
|
||||
Args:
|
||||
identifier: Resource identifier.
|
||||
checks (bool): Additional checks required
|
||||
"""
|
||||
# Request (partial?) list of repositories info
|
||||
response = self.safely_issue_request(identifier)
|
||||
models_list = self.transport_response_simplified(response)
|
||||
models_list = self.filter_before_inject(models_list)
|
||||
if checks:
|
||||
models_list = self.do_additional_checks(models_list)
|
||||
if not models_list:
|
||||
return response, []
|
||||
# inject into local db
|
||||
injected = self.inject_repo_data_into_db(models_list)
|
||||
# queue workers
|
||||
|
|
|
@ -79,14 +79,18 @@ class PageByPageLister(SWHListerBase):
|
|||
|
||||
# You probably don't need to override anything below this line.
|
||||
|
||||
def check_existence(self, injected_repos):
|
||||
"""Given a list of injected repos, check if we already have them.
|
||||
def do_additional_checks(self, models_list):
|
||||
"""Potentially check for existence of repositories in models_list.
|
||||
|
||||
Attribute 'instance' variable is assumed to be populated.
|
||||
This will be called only if check_existence is flipped on in
|
||||
the run method below.
|
||||
|
||||
"""
|
||||
# FIXME: Implement the check
|
||||
return False
|
||||
for m in models_list:
|
||||
sql_repo = self.db_query_equal('uid', m['uid'])
|
||||
if sql_repo:
|
||||
return False
|
||||
return models_list
|
||||
|
||||
def run(self, min_bound=None, max_bound=None, check_existence=False):
|
||||
"""Main entry function. Sequentially fetches repository data from the
|
||||
|
@ -111,16 +115,17 @@ class PageByPageLister(SWHListerBase):
|
|||
|
||||
self.min_page = min_bound
|
||||
self.max_page = max_bound
|
||||
already_seen = False
|
||||
|
||||
while self.is_within_bounds(page, self.min_page, self.max_page):
|
||||
logging.info('listing repos starting at %s' % page)
|
||||
|
||||
response, injected_repos = self.ingest_data(page)
|
||||
next_page = self.get_next_target_from_response(response)
|
||||
response, injected_repos = self.ingest_data(page,
|
||||
checks=check_existence)
|
||||
if not injected_repos:
|
||||
logging.info('Repositories already seen, stopping')
|
||||
break
|
||||
|
||||
if check_existence:
|
||||
already_seen = self.check_existence(injected_repos)
|
||||
next_page = self.get_next_target_from_response(response)
|
||||
|
||||
# termination condition
|
||||
|
||||
|
@ -128,9 +133,6 @@ class PageByPageLister(SWHListerBase):
|
|||
logging.info('stopping after page %s, no next link found' %
|
||||
page)
|
||||
break
|
||||
elif already_seen:
|
||||
logging.info('Repositories already seen, stopping')
|
||||
break
|
||||
else:
|
||||
page = next_page
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue