tasks: normalize the url argument name of most lister

Since all the listing tasks accepts an url as first argument (whatever the
argument name is), it makes sense to use a simple common argument name for
this. I've chosen 'url' instead of api_baseurl/forge_url/url.

Also kill now useless `new_lister()` functions.
This commit is contained in:
David Douard 2019-09-04 11:19:16 +02:00
parent 631b8e7668
commit b810876ef8
19 changed files with 60 additions and 91 deletions

View file

@ -23,9 +23,8 @@ class BitBucketLister(IndexingHttpLister):
instance = 'bitbucket'
default_min_bound = datetime.utcfromtimestamp(0)
def __init__(self, api_baseurl=None, override_config=None, per_page=100):
super().__init__(
api_baseurl=api_baseurl, override_config=override_config)
def __init__(self, url=None, override_config=None, per_page=100):
super().__init__(url=url, override_config=override_config)
per_page = self.config.get('per_page', per_page)
self.PATH_TEMPLATE = '%s&pagelen=%s' % (

View file

@ -12,20 +12,16 @@ from .lister import BitBucketLister
GROUP_SPLIT = 10000
def new_lister(api_baseurl='https://api.bitbucket.org/2.0', per_page=100):
return BitBucketLister(api_baseurl=api_baseurl, per_page=per_page)
@app.task(name=__name__ + '.IncrementalBitBucketLister')
def list_bitbucket_incremental(**lister_args):
'''Incremental update of the BitBucket forge'''
lister = new_lister(**lister_args)
lister = BitBucketLister(**lister_args)
lister.run(min_bound=lister.db_last_index(), max_bound=None)
@app.task(name=__name__ + '.RangeBitBucketLister')
def _range_bitbucket_lister(start, end, **lister_args):
lister = new_lister(**lister_args)
lister = BitBucketLister(**lister_args)
lister.run(min_bound=start, max_bound=end)
@ -36,7 +32,7 @@ def list_bitbucket_full(self, split=None, **lister_args):
It's not to be called for an initial listing.
"""
lister = new_lister(**lister_args)
lister = BitBucketLister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')

View file

@ -26,8 +26,7 @@ def test_incremental(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(
api_baseurl='https://api.bitbucket.org/2.0', per_page=100)
lister.assert_called_once_with()
lister.db_last_index.assert_called_once_with()
lister.run.assert_called_once_with(min_bound=42, max_bound=None)
@ -45,8 +44,7 @@ def test_range(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(
api_baseurl='https://api.bitbucket.org/2.0', per_page=100)
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@ -76,8 +74,7 @@ def test_relister(lister, swh_app, celery_session_worker):
break
sleep(1)
lister.assert_called_with(
api_baseurl='https://api.bitbucket.org/2.0', per_page=100)
lister.assert_called_with()
# one by the FullBitbucketRelister task
# + 5 for the RangeBitbucketLister subtasks

View file

@ -63,7 +63,7 @@ def get_lister(lister_name, db_url=None, **conf):
(lister_name, SUPPORTED_LISTERS))
if db_url:
conf['lister'] = {'cls': 'local', 'args': {'db': db_url}}
# To allow api_baseurl override per lister
registry_entry = LISTERS[lister_name].load()()
lister_cls = registry_entry['lister']
lister = lister_cls(override_config=conf)

View file

@ -244,6 +244,6 @@ class IndexingLister(ListerBase):
class IndexingHttpLister(ListerHttpTransport, IndexingLister):
"""Convenience class for ensuring right lookup and init order
when combining IndexingLister and ListerHttpTransport."""
def __init__(self, api_baseurl=None, override_config=None):
def __init__(self, url=None, override_config=None):
IndexingLister.__init__(self, override_config=override_config)
ListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
ListerHttpTransport.__init__(self, url=url)

View file

@ -96,7 +96,7 @@ class ListerHttpTransport(abc.ABC):
required.
"""
path = self.PATH_TEMPLATE % identifier
return self.api_baseurl + path
return self.url + path
def request_params(self, identifier):
"""Get the full parameters passed to requests given the
@ -142,14 +142,14 @@ class ListerHttpTransport(abc.ABC):
self.reset_backoff()
return False, 0
def __init__(self, api_baseurl=None):
if not api_baseurl:
api_baseurl = self.config.get('api_baseurl')
if not api_baseurl:
api_baseurl = self.DEFAULT_URL
if not api_baseurl:
raise NameError('HTTP Lister Transport requires api_baseurl.')
self.api_baseurl = api_baseurl # eg. 'https://api.github.com'
def __init__(self, url=None):
if not url:
url = self.config.get('url')
if not url:
url = self.DEFAULT_URL
if not url:
raise NameError('HTTP Lister Transport requires an url.')
self.url = url # eg. 'https://api.github.com'
self.session = requests.Session()
self.lister_version = __version__
@ -218,7 +218,7 @@ class ListerOnePageApiTransport(ListerHttpTransport):
"parse for information")
PATH_TEMPLATE = None # we do not use it
def __init__(self, api_baseurl=None):
def __init__(self, url=None):
self.session = requests.Session()
self.lister_version = __version__

View file

@ -155,6 +155,6 @@ class PageByPageHttpLister(ListerHttpTransport, PageByPageLister):
combining PageByPageLister and ListerHttpTransport.
"""
def __init__(self, api_baseurl=None, override_config=None):
def __init__(self, url=None, override_config=None):
PageByPageLister.__init__(self, override_config=override_config)
ListerHttpTransport.__init__(self, api_baseurl=api_baseurl)
ListerHttpTransport.__init__(self, url=url)

View file

@ -77,7 +77,7 @@ class HttpListerTesterBase(abc.ABC):
"""
if override_config or self.fl is None:
self.fl = self.Lister(api_baseurl='https://fakeurl',
self.fl = self.Lister(url='https://fakeurl',
override_config=override_config)
self.fl.INITIAL_BACKOFF = 1

View file

@ -35,7 +35,7 @@ class DebianLister(ListerHttpTransport, ListerBase):
instance = 'debian'
def __init__(self, override_config=None):
ListerHttpTransport.__init__(self, api_baseurl="bogus")
ListerHttpTransport.__init__(self, url="notused")
ListerBase.__init__(self, override_config=override_config)
def transport_request(self, identifier):

View file

@ -12,20 +12,16 @@ from swh.lister.github.lister import GitHubLister
GROUP_SPLIT = 10000
def new_lister(api_baseurl='https://api.github.com', **kw):
return GitHubLister(api_baseurl=api_baseurl, **kw)
@app.task(name=__name__ + '.IncrementalGitHubLister')
def list_github_incremental(**lister_args):
'Incremental update of GitHub'
lister = new_lister(**lister_args)
lister = GitHubLister(**lister_args)
lister.run(min_bound=lister.db_last_index(), max_bound=None)
@app.task(name=__name__ + '.RangeGitHubLister')
def _range_github_lister(start, end, **lister_args):
lister = new_lister(**lister_args)
lister = GitHubLister(**lister_args)
lister.run(min_bound=start, max_bound=end)
@ -36,7 +32,7 @@ def list_github_full(self, split=None, **lister_args):
It's not to be called for an initial listing.
"""
lister = new_lister(**lister_args)
lister = GitHubLister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')

View file

@ -26,7 +26,7 @@ def test_incremental(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(api_baseurl='https://api.github.com')
lister.assert_called_once_with()
lister.db_last_index.assert_called_once_with()
lister.run.assert_called_once_with(min_bound=42, max_bound=None)
@ -44,7 +44,7 @@ def test_range(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(api_baseurl='https://api.github.com')
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@ -74,7 +74,7 @@ def test_relister(lister, swh_app, celery_session_worker):
break
sleep(1)
lister.assert_called_with(api_baseurl='https://api.github.com')
lister.assert_called_with()
# one by the FullGitHubRelister task
# + 5 for the RangeGitHubLister subtasks

View file

@ -16,12 +16,11 @@ class GitLabLister(PageByPageHttpLister):
MODEL = GitLabModel
LISTER_NAME = 'gitlab'
def __init__(self, api_baseurl=None, instance=None,
def __init__(self, url=None, instance=None,
override_config=None, sort='asc', per_page=20):
super().__init__(api_baseurl=api_baseurl,
override_config=override_config)
super().__init__(url=url, override_config=override_config)
if instance is None:
instance = parse_url(self.api_baseurl).host
instance = parse_url(self.url).host
self.instance = instance
self.PATH_TEMPLATE = '%s&sort=%s&per_page=%s' % (
self.PATH_TEMPLATE, sort, per_page)

View file

@ -14,18 +14,11 @@ from .lister import GitLabLister
NBPAGES = 10
def new_lister(api_baseurl='https://gitlab.com/api/v4',
instance=None, sort='asc', per_page=20):
return GitLabLister(
api_baseurl=api_baseurl, instance=instance, sort=sort,
per_page=per_page)
@app.task(name=__name__ + '.IncrementalGitLabLister')
def list_gitlab_incremental(**lister_args):
"""Incremental update of a GitLab instance"""
lister_args['sort'] = 'desc'
lister = new_lister(**lister_args)
lister = GitLabLister(**lister_args)
total_pages = lister.get_pages_information()[1]
# stopping as soon as existing origins for that instance are detected
lister.run(min_bound=1, max_bound=total_pages, check_existence=True)
@ -33,14 +26,14 @@ def list_gitlab_incremental(**lister_args):
@app.task(name=__name__ + '.RangeGitLabLister')
def _range_gitlab_lister(start, end, **lister_args):
lister = new_lister(**lister_args)
lister = GitLabLister(**lister_args)
lister.run(min_bound=start, max_bound=end)
@app.task(name=__name__ + '.FullGitLabRelister', bind=True)
def list_gitlab_full(self, **lister_args):
"""Full update of a GitLab instance"""
lister = new_lister(**lister_args)
lister = GitLabLister(**lister_args)
_, total_pages, _ = lister.get_pages_information()
ranges = list(utils.split_range(total_pages, NBPAGES))
random.shuffle(ranges)

View file

@ -26,9 +26,7 @@ def test_incremental(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(
api_baseurl='https://gitlab.com/api/v4',
instance=None, sort='desc', per_page=20)
lister.assert_called_once_with(sort='desc')
lister.db_last_index.assert_not_called()
lister.get_pages_information.assert_called_once_with()
lister.run.assert_called_once_with(
@ -48,9 +46,7 @@ def test_range(lister, swh_app, celery_session_worker):
res.wait()
assert res.successful()
lister.assert_called_once_with(
api_baseurl='https://gitlab.com/api/v4',
instance=None, sort='asc', per_page=20)
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.run.assert_called_once_with(min_bound=12, max_bound=42)
@ -81,9 +77,7 @@ def test_relister(lister, swh_app, celery_session_worker):
break
sleep(1)
lister.assert_called_with(
api_baseurl='https://gitlab.com/api/v4',
instance=None, sort='asc', per_page=20)
lister.assert_called_with()
# one by the FullGitlabRelister task
# + 9 for the RangeGitlabLister subtasks
@ -113,7 +107,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker):
res = swh_app.send_task(
'swh.lister.gitlab.tasks.FullGitLabRelister',
kwargs=dict(api_baseurl='https://0xacab.org/api/v4'))
kwargs=dict(url='https://0xacab.org/api/v4'))
assert res
res.wait()
@ -129,9 +123,7 @@ def test_relister_instance(lister, swh_app, celery_session_worker):
break
sleep(1)
lister.assert_called_with(
api_baseurl='https://0xacab.org/api/v4',
instance=None, sort='asc', per_page=20)
lister.assert_called_with(url='https://0xacab.org/api/v4')
# one by the FullGitlabRelister task
# + 9 for the RangeGitlabLister subtasks

View file

@ -16,10 +16,9 @@ class NpmListerBase(IndexingHttpLister):
LISTER_NAME = 'npm'
instance = 'npm'
def __init__(self, api_baseurl='https://replicate.npmjs.com',
def __init__(self, url='https://replicate.npmjs.com',
per_page=1000, override_config=None):
super().__init__(api_baseurl=api_baseurl,
override_config=override_config)
super().__init__(url=url, override_config=override_config)
self.per_page = per_page + 1
self.PATH_TEMPLATE += '&limit=%s' % self.per_page
@ -76,7 +75,7 @@ class NpmListerBase(IndexingHttpLister):
'https://www.npmjs.com/package/%s' % repo_name,
# package metadata url needs to be escaped otherwise some requests
# may fail (for instance when a package name contains '/')
'%s/%s' % (self.api_baseurl, quote(repo_name, safe=''))
'%s/%s' % (self.url, quote(repo_name, safe=''))
)
def string_pattern_check(self, inner, lower, upper=None):

View file

@ -14,7 +14,7 @@ from swh.lister.npm.models import NpmVisitModel
@contextmanager
def save_registry_state(lister):
params = {'headers': lister.request_headers()}
registry_state = lister.session.get(lister.api_baseurl, **params)
registry_state = lister.session.get(lister.url, **params)
registry_state = registry_state.json()
keys = ('doc_count', 'doc_del_count', 'update_seq', 'purge_seq',
'disk_size', 'data_size', 'committed_update_seq',

View file

@ -20,11 +20,10 @@ class PhabricatorLister(IndexingHttpLister):
MODEL = PhabricatorModel
LISTER_NAME = 'phabricator'
def __init__(self, api_baseurl=None, instance=None, override_config=None):
super().__init__(api_baseurl=api_baseurl,
override_config=override_config)
def __init__(self, url=None, instance=None, override_config=None):
super().__init__(url=url, override_config=override_config)
if not instance:
instance = urllib.parse.urlparse(self.api_baseurl).hostname
instance = urllib.parse.urlparse(self.url).hostname
self.instance = instance
@property

View file

@ -33,9 +33,8 @@ class PhabricatorListerTester(HttpListerTester, unittest.TestCase):
]}}
override_config = dict(credentials=credentials,
**(override_config or {}))
self.fl = self.Lister(
api_baseurl='https://fakeurl', instance='fake',
override_config=override_config)
self.fl = self.Lister(url='https://fakeurl', instance='fake',
override_config=override_config)
self.fl.INITIAL_BACKOFF = 1
self.fl.reset_backoff()

View file

@ -67,34 +67,34 @@ def test_get_lister_override():
db_url = init_db().url()
listers = {
'gitlab': ('api_baseurl', 'https://gitlab.uni/api/v4/'),
'phabricator': (
'api_baseurl',
'https://somewhere.org/api/diffusion.repository.search'),
'gitlab': 'https://other.gitlab.uni/api/v4/',
'phabricator': 'https://somewhere.org/api/diffusion.repository.search',
'cgit': 'https://some.where/cgit',
}
# check the override ends up defined in the lister
for lister_name, (url_key, url_value) in listers.items():
for lister_name, url in listers.items():
lst = get_lister(
lister_name, db_url, **{
url_key: url_value,
'url': url,
'priority': 'high',
'policy': 'oneshot',
})
assert getattr(lst, url_key) == url_value
assert lst.url == url
assert lst.config['priority'] == 'high'
assert lst.config['policy'] == 'oneshot'
# check the default urls are used and not the override (since it's not
# passed)
for lister_name, (url_key, url_value) in listers.items():
for lister_name, url in listers.items():
lst = get_lister(lister_name, db_url)
# no override so this does not end up in lister's configuration
assert url_key not in lst.config
assert 'url' not in lst.config
assert 'priority' not in lst.config
assert 'oneshot' not in lst.config
assert lst.url == lst.DEFAULT_URL
def test_task_types(swh_scheduler_config, tmp_path):