relister: Fix consistently the behavior for the first time relisting
If nothing has been done prior to a full relisting, there is actually nothing
to list. So the relister in question does nothing.
In that context, the IndexingLister class's `db_partition_indices` method now
returns an empty list instead of raising a ValueError when there is nothing to
list.
Related T1826
Related e129e48
This commit is contained in:
parent
6662ae8db5
commit
b99617f976
6 changed files with 44 additions and 39 deletions
|
@ -2,10 +2,11 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import iso8601
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from urllib import parse
|
||||
|
||||
from swh.lister.bitbucket.models import BitBucketModel
|
||||
|
@ -23,6 +24,7 @@ class BitBucketLister(IndexingHttpLister):
|
|||
MODEL = BitBucketModel
|
||||
LISTER_NAME = 'bitbucket'
|
||||
instance = 'bitbucket'
|
||||
default_min_bound = datetime.utcfromtimestamp(0).isoformat()
|
||||
|
||||
def __init__(self, api_baseurl, override_config=None, per_page=100):
|
||||
super().__init__(
|
||||
|
@ -54,21 +56,6 @@ class BitBucketLister(IndexingHttpLister):
|
|||
repos = response.json()['values']
|
||||
return [self.get_model_from_repo(repo) for repo in repos]
|
||||
|
||||
def db_first_index(self):
|
||||
"""For the first time listing, there is no data in db, so fallback to the
|
||||
bitbucket starting year.
|
||||
|
||||
"""
|
||||
return super().db_first_index() or '2008-01-01T00:00:00Z'
|
||||
|
||||
def db_last_index(self):
|
||||
"""For the first time listing, there is no data in db, so fallback to the time
|
||||
of the first run as max date.
|
||||
|
||||
"""
|
||||
return super().db_last_index() or datetime.datetime.now(
|
||||
tz=datetime.timezone.utc).isoformat()
|
||||
|
||||
def request_uri(self, identifier):
|
||||
return super().request_uri(identifier or '1970-01-01')
|
||||
|
||||
|
|
|
@ -30,8 +30,17 @@ def range_bitbucket_lister(start, end, **lister_args):
|
|||
|
||||
@app.task(name=__name__ + '.FullBitBucketRelister', bind=True)
|
||||
def full_bitbucket_relister(self, split=None, **lister_args):
|
||||
"""Relist from the beginning of what's already been listed.
|
||||
|
||||
It's not to be called for an initial listing.
|
||||
|
||||
"""
|
||||
lister = new_lister(**lister_args)
|
||||
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
|
||||
if not ranges:
|
||||
self.log.info('Nothing to list')
|
||||
return
|
||||
|
||||
random.shuffle(ranges)
|
||||
promise = group(range_bitbucket_lister.s(minv, maxv, **lister_args)
|
||||
for minv, maxv in ranges)()
|
||||
|
|
|
@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
class IndexingLister(ListerBase):
|
||||
flush_packet_db = 20
|
||||
default_min_bound = ''
|
||||
"""Lister* intermediate class for any service that follows the pattern:
|
||||
|
||||
- The service must report at least one stable unique identifier, known
|
||||
|
@ -95,17 +96,18 @@ class IndexingLister(ListerBase):
|
|||
|
||||
def db_partition_indices(self, partition_size):
|
||||
"""Describe an index-space compartmentalization of the db table
|
||||
in equal sized chunks. This is used to describe min&max bounds for
|
||||
parallelizing fetch tasks.
|
||||
in equal sized chunks. This is used to describe min&max bounds for
|
||||
parallelizing fetch tasks.
|
||||
|
||||
Args:
|
||||
partition_size (int): desired size to make each partition
|
||||
|
||||
Returns:
|
||||
a list of tuples (begin, end) of indexable value that
|
||||
declare approximately equal-sized ranges of existing
|
||||
repos
|
||||
"""
|
||||
declare approximately equal-sized ranges of existing
|
||||
repos
|
||||
|
||||
"""
|
||||
n = max(self.db_num_entries(), 10)
|
||||
partition_size = min(partition_size, n)
|
||||
n_partitions = n // partition_size
|
||||
|
@ -114,7 +116,8 @@ class IndexingLister(ListerBase):
|
|||
max_index = self.db_last_index()
|
||||
|
||||
if not min_index or not max_index:
|
||||
raise ValueError("Can't partition an empty range")
|
||||
# Nothing to list
|
||||
return []
|
||||
|
||||
if isinstance(min_index, str):
|
||||
def format_bound(bound):
|
||||
|
@ -201,7 +204,7 @@ class IndexingLister(ListerBase):
|
|||
self.max_index = max_bound
|
||||
|
||||
def ingest_indexes():
|
||||
index = min_bound or ''
|
||||
index = min_bound or self.default_min_bound
|
||||
for i in count(1):
|
||||
response, injected_repos = self.ingest_data(index)
|
||||
if not response and not injected_repos:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2017-2018 the Software Heritage developers
|
||||
# Copyright (C) 2017-2019 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -30,8 +30,16 @@ def range_github_lister(start, end, **lister_args):
|
|||
|
||||
@app.task(name=__name__ + '.FullGitHubRelister', bind=True)
|
||||
def full_github_relister(self, split=None, **lister_args):
|
||||
"""Relist from the beginning of what's already been listed.
|
||||
|
||||
It's not to be called for an initial listing.
|
||||
|
||||
"""
|
||||
lister = new_lister(**lister_args)
|
||||
ranges = lister.db_partition_indices(split or GROUP_SPLIT)
|
||||
if not ranges:
|
||||
self.log.info('Nothing to list')
|
||||
return
|
||||
random.shuffle(ranges)
|
||||
promise = group(range_github_lister.s(minv, maxv, **lister_args)
|
||||
for minv, maxv in ranges)()
|
||||
|
|
|
@ -38,6 +38,11 @@ def range_gitlab_lister(start, end, **lister_args):
|
|||
|
||||
@app.task(name=__name__ + '.FullGitLabRelister', bind=True)
|
||||
def full_gitlab_relister(self, **lister_args):
|
||||
"""Full lister
|
||||
|
||||
This should be renamed as such.
|
||||
|
||||
"""
|
||||
lister = new_lister(**lister_args)
|
||||
_, total_pages, _ = lister.get_pages_information()
|
||||
ranges = list(utils.split_range(total_pages, NBPAGES))
|
||||
|
|
|
@ -31,6 +31,14 @@ class PhabricatorLister(IndexingHttpLister):
|
|||
super().__init__(api_baseurl=api_baseurl,
|
||||
override_config=override_config)
|
||||
|
||||
@property
|
||||
def default_min_bound(self):
|
||||
"""Starting boundary when `min_bound` is not defined (db empty). This
|
||||
is used within the fn:`run` call.
|
||||
|
||||
"""
|
||||
return self._bootstrap_repositories_listing()
|
||||
|
||||
def _build_query_params(self, params, api_token):
|
||||
"""Build query params to include the forge's api token
|
||||
|
||||
|
@ -134,21 +142,6 @@ class PhabricatorLister(IndexingHttpLister):
|
|||
self.schedule_missing_tasks(models_list, injected)
|
||||
return self.max_index
|
||||
|
||||
def run(self, min_bound=None, max_bound=None):
|
||||
"""
|
||||
(Override) Run the lister on the specified Phabricator instance
|
||||
|
||||
Args:
|
||||
min_bound (int): Optional repository index to start the listing
|
||||
after it
|
||||
max_bound (int): Optional repository index to stop the listing
|
||||
after it
|
||||
"""
|
||||
# initial call to the lister, we need to bootstrap it in that case
|
||||
if min_bound is None:
|
||||
min_bound = self._bootstrap_repositories_listing()
|
||||
super().run(min_bound, max_bound)
|
||||
|
||||
|
||||
def get_repo_url(attachments):
|
||||
"""
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue