swh.lister.gitlab: Add Incremental lister behavior
Related T989
This commit is contained in:
parent
ccd0525c9b
commit
847a8d341a
3 changed files with 43 additions and 4 deletions
|
@ -79,7 +79,14 @@ class PageByPageLister(SWHListerBase):
|
|||
|
||||
# You probably don't need to override anything below this line.
|
||||
|
||||
def run(self, min_bound=None, max_bound=None):
|
||||
def check_existence(self, injected_repos):
|
||||
"""Given a list of injected repos, check if we already have them.
|
||||
|
||||
"""
|
||||
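# FIXME: Implement the check. Until then this always returns False, so run(check_existence=True) never stops early on already-seen repositories.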
|
||||
return False
|
||||
|
||||
def run(self, min_bound=None, max_bound=None, check_existence=False):
|
||||
"""Main entry function. Sequentially fetches repository data from the
|
||||
service according to the basic outline in the class
|
||||
docstring. Continually fetching sublists until either there
|
||||
|
@ -89,6 +96,9 @@ class PageByPageLister(SWHListerBase):
|
|||
Args:
|
||||
min_bound: optional page to start from
|
||||
max_bound: optional page to stop at
|
||||
check_existence (bool): optional existence check (for
|
||||
incremental lister whose sort
|
||||
order is inverted)
|
||||
|
||||
Returns:
|
||||
nothing
|
||||
|
@ -99,6 +109,7 @@ class PageByPageLister(SWHListerBase):
|
|||
|
||||
self.min_page = min_bound
|
||||
self.max_page = max_bound
|
||||
already_seen = False
|
||||
|
||||
while self.is_within_bounds(page, self.min_page, self.max_page):
|
||||
logging.info('listing repos starting at %s' % page)
|
||||
|
@ -106,12 +117,18 @@ class PageByPageLister(SWHListerBase):
|
|||
response, injected_repos = self.ingest_data(page)
|
||||
next_page = self.get_next_target_from_response(response)
|
||||
|
||||
if check_existence:
|
||||
already_seen = self.check_existence(injected_repos)
|
||||
|
||||
# termination condition
|
||||
|
||||
if (next_page is None) or (next_page == page):
|
||||
logging.info('stopping after page %s, no next link found' %
|
||||
page)
|
||||
break
|
||||
elif already_seen:
|
||||
logging.info('Repositories already seen, stopping')
|
||||
break
|
||||
else:
|
||||
page = next_page
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ from .models import GitLabModel
|
|||
|
||||
class GitLabLister(PageByPageHttpLister):
|
||||
# Template path expecting an integer that represents the page id
|
||||
PATH_TEMPLATE = '/projects?page=%d&order_by=id&sort=asc&simple=true'
|
||||
PATH_TEMPLATE = '/projects?page=%d&order_by=id'
|
||||
API_URL_INDEX_RE = re.compile(r'^.*/projects.*page=(\d+).*')
|
||||
MODEL = GitLabModel
|
||||
LISTER_NAME = 'gitlab'
|
||||
|
@ -103,7 +103,7 @@ class GitLabLister(PageByPageHttpLister):
|
|||
return None
|
||||
|
||||
def get_pages_information(self):
|
||||
"""Determine some pages information.
|
||||
"""Determine pages information.
|
||||
|
||||
"""
|
||||
response = self.transport_head(identifier=1)
|
||||
|
|
|
@ -17,13 +17,16 @@ class GitLabListerTask(ListerTaskBase):
|
|||
|
||||
|
||||
class RangeGitLabLister(GitLabListerTask, RangeListerTask):
|
||||
"""GitLab lister working on specified range (start, end) arguments.
|
||||
"""Range GitLab lister (list available origins on specified range)
|
||||
|
||||
"""
|
||||
task_queue = 'swh_lister_gitlab_refresh'
|
||||
|
||||
|
||||
class FullGitLabRelister(GitLabListerTask):
|
||||
"""Full GitLab lister (list all available origins from the api).
|
||||
|
||||
"""
|
||||
task_queue = 'swh_lister_gitlab_refresh'
|
||||
|
||||
def run_task(self, *args, **kwargs):
|
||||
|
@ -41,3 +44,22 @@ class FullGitLabRelister(GitLabListerTask):
|
|||
range_task = RangeGitLabLister()
|
||||
group(range_task.s(minv, maxv, *args, **kwargs)
|
||||
for minv, maxv in ranges)()
|
||||
|
||||
|
||||
class IncrementalGitLabLister(ListerTaskBase):
|
||||
"""Incremental GitLab lister (list only new available origins).
|
||||
|
||||
"""
|
||||
task_queue = 'swh_lister_gitlab_discover'
|
||||
|
||||
def new_lister(self, api_baseurl='https://gitlab.com/api/v4',
|
||||
instance='gitlab.com',):
|
||||
# sort='desc' asks the lister for its results in inverted (descending) order
|
||||
return GitLabLister(instance=instance, api_baseurl=api_baseurl,
|
||||
sort='desc')
|
||||
|
||||
def run_task(self, *args, **kwargs):
|
||||
lister = self.new_lister(*args, **kwargs)
|
||||
# check_existence=True makes run() stop once check_existence() reports the page's repositories as already seen
|
||||
return lister.run(min_bound=None, max_bound=None,
|
||||
check_existence=True)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue