relister: Make the behavior of first-time relisting consistent

If nothing has been listed prior to a full relisting, there is nothing to
relist, so the relister in question simply does nothing.

In that context, the IndexingLister class's `db_partition_indices` method now
returns an empty list instead of raising a ValueError when there is nothing to
list.

Related T1826
Related e129e48
This commit is contained in:
Antoine R. Dumont (@ardumont) 2019-06-21 19:42:17 +02:00
parent 6662ae8db5
commit b99617f976
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
6 changed files with 44 additions and 39 deletions

View file

@ -2,10 +2,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import iso8601
from datetime import datetime
from urllib import parse
from swh.lister.bitbucket.models import BitBucketModel
@ -23,6 +24,7 @@ class BitBucketLister(IndexingHttpLister):
MODEL = BitBucketModel
LISTER_NAME = 'bitbucket'
instance = 'bitbucket'
default_min_bound = datetime.utcfromtimestamp(0).isoformat()
def __init__(self, api_baseurl, override_config=None, per_page=100):
super().__init__(
@ -54,21 +56,6 @@ class BitBucketLister(IndexingHttpLister):
repos = response.json()['values']
return [self.get_model_from_repo(repo) for repo in repos]
def db_first_index(self):
"""For the first time listing, there is no data in db, so fallback to the
bitbucket starting year.
"""
return super().db_first_index() or '2008-01-01T00:00:00Z'
def db_last_index(self):
"""For the first time listing, there is no data in db, so fallback to the time
of the first run as max date.
"""
return super().db_last_index() or datetime.datetime.now(
tz=datetime.timezone.utc).isoformat()
def request_uri(self, identifier):
return super().request_uri(identifier or '1970-01-01')

View file

@ -30,8 +30,17 @@ def range_bitbucket_lister(start, end, **lister_args):
@app.task(name=__name__ + '.FullBitBucketRelister', bind=True)
def full_bitbucket_relister(self, split=None, **lister_args):
"""Relist from the beginning of what's already been listed.
It's not to be called for an initial listing.
"""
lister = new_lister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')
return
random.shuffle(ranges)
promise = group(range_bitbucket_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()

View file

@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
class IndexingLister(ListerBase):
flush_packet_db = 20
default_min_bound = ''
"""Lister* intermediate class for any service that follows the pattern:
- The service must report at least one stable unique identifier, known
@ -95,17 +96,18 @@ class IndexingLister(ListerBase):
def db_partition_indices(self, partition_size):
"""Describe an index-space compartmentalization of the db table
in equal sized chunks. This is used to describe min&max bounds for
parallelizing fetch tasks.
in equal sized chunks. This is used to describe min&max bounds for
parallelizing fetch tasks.
Args:
partition_size (int): desired size to make each partition
Returns:
a list of tuples (begin, end) of indexable value that
declare approximately equal-sized ranges of existing
repos
"""
declare approximately equal-sized ranges of existing
repos
"""
n = max(self.db_num_entries(), 10)
partition_size = min(partition_size, n)
n_partitions = n // partition_size
@ -114,7 +116,8 @@ class IndexingLister(ListerBase):
max_index = self.db_last_index()
if not min_index or not max_index:
raise ValueError("Can't partition an empty range")
# Nothing to list
return []
if isinstance(min_index, str):
def format_bound(bound):
@ -201,7 +204,7 @@ class IndexingLister(ListerBase):
self.max_index = max_bound
def ingest_indexes():
index = min_bound or ''
index = min_bound or self.default_min_bound
for i in count(1):
response, injected_repos = self.ingest_data(index)
if not response and not injected_repos:

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2018 the Software Heritage developers
# Copyright (C) 2017-2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -30,8 +30,16 @@ def range_github_lister(start, end, **lister_args):
@app.task(name=__name__ + '.FullGitHubRelister', bind=True)
def full_github_relister(self, split=None, **lister_args):
"""Relist from the beginning of what's already been listed.
It's not to be called for an initial listing.
"""
lister = new_lister(**lister_args)
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
if not ranges:
self.log.info('Nothing to list')
return
random.shuffle(ranges)
promise = group(range_github_lister.s(minv, maxv, **lister_args)
for minv, maxv in ranges)()

View file

@ -38,6 +38,11 @@ def range_gitlab_lister(start, end, **lister_args):
@app.task(name=__name__ + '.FullGitLabRelister', bind=True)
def full_gitlab_relister(self, **lister_args):
"""Full lister
This should be renamed as such.
"""
lister = new_lister(**lister_args)
_, total_pages, _ = lister.get_pages_information()
ranges = list(utils.split_range(total_pages, NBPAGES))

View file

@ -31,6 +31,14 @@ class PhabricatorLister(IndexingHttpLister):
super().__init__(api_baseurl=api_baseurl,
override_config=override_config)
@property
def default_min_bound(self):
"""Starting boundary when `min_bound` is not defined (db empty). This
is used within the fn:`run` call.
"""
return self._bootstrap_repositories_listing()
def _build_query_params(self, params, api_token):
"""Build query params to include the forge's api token
@ -134,21 +142,6 @@ class PhabricatorLister(IndexingHttpLister):
self.schedule_missing_tasks(models_list, injected)
return self.max_index
def run(self, min_bound=None, max_bound=None):
"""
(Override) Run the lister on the specified Phabricator instance
Args:
min_bound (int): Optional repository index to start the listing
after it
max_bound (int): Optional repository index to stop the listing
after it
"""
# initial call to the lister, we need to bootstrap it in that case
if min_bound is None:
min_bound = self._bootstrap_repositories_listing()
super().run(min_bound, max_bound)
def get_repo_url(attachments):
"""