From b99617f97615d30cd6570fd951e317d00bb330d3 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Fri, 21 Jun 2019 19:42:17 +0200 Subject: [PATCH] relister: Fix consistently the behavior for the first time relisting If nothing has been done prior to a full relisting, there is actually nothing to list. So the relister in question does nothing. In that context, the IndexingLister class's `db_partition_indices` method now returns an empty list instead of raising a ValueError when there is nothing to list. Related T1826 Related e129e48 --- swh/lister/bitbucket/lister.py | 19 +++---------------- swh/lister/bitbucket/tasks.py | 9 +++++++++ swh/lister/core/indexing_lister.py | 17 ++++++++++------- swh/lister/github/tasks.py | 10 +++++++++- swh/lister/gitlab/tasks.py | 5 +++++ swh/lister/phabricator/lister.py | 23 ++++++++--------------- 6 files changed, 44 insertions(+), 39 deletions(-) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index d7dfe51..a5cf91b 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -2,10 +2,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime import logging import iso8601 +from datetime import datetime + from urllib import parse from swh.lister.bitbucket.models import BitBucketModel @@ -23,6 +24,7 @@ class BitBucketLister(IndexingHttpLister): MODEL = BitBucketModel LISTER_NAME = 'bitbucket' instance = 'bitbucket' + default_min_bound = datetime.utcfromtimestamp(0).isoformat() def __init__(self, api_baseurl, override_config=None, per_page=100): super().__init__( @@ -54,21 +56,6 @@ class BitBucketLister(IndexingHttpLister): repos = response.json()['values'] return [self.get_model_from_repo(repo) for repo in repos] - def db_first_index(self): - """For the first time listing, there is no data in db, so fallback to the - bitbucket starting year. - - """ - return super().db_first_index() or '2008-01-01T00:00:00Z' - - def db_last_index(self): - """For the first time listing, there is no data in db, so fallback to the time - of the first run as max date. - - """ - return super().db_last_index() or datetime.datetime.now( - tz=datetime.timezone.utc).isoformat() - def request_uri(self, identifier): return super().request_uri(identifier or '1970-01-01') diff --git a/swh/lister/bitbucket/tasks.py b/swh/lister/bitbucket/tasks.py index 1bb00c9..a084846 100644 --- a/swh/lister/bitbucket/tasks.py +++ b/swh/lister/bitbucket/tasks.py @@ -30,8 +30,17 @@ def range_bitbucket_lister(start, end, **lister_args): @app.task(name=__name__ + '.FullBitBucketRelister', bind=True) def full_bitbucket_relister(self, split=None, **lister_args): + """Relist from the beginning of what's already been listed. + + It's not to be called for an initial listing. + + """ lister = new_lister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) + if not ranges: + self.log.info('Nothing to list') + return + random.shuffle(ranges) promise = group(range_bitbucket_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges)() diff --git a/swh/lister/core/indexing_lister.py b/swh/lister/core/indexing_lister.py index aa913e4..5436da1 100644 --- a/swh/lister/core/indexing_lister.py +++ b/swh/lister/core/indexing_lister.py @@ -17,6 +17,7 @@ logger = logging.getLogger(__name__) class IndexingLister(ListerBase): flush_packet_db = 20 + default_min_bound = '' """Lister* intermediate class for any service that follows the pattern: - The service must report at least one stable unique identifier, known @@ -95,17 +96,18 @@ class IndexingLister(ListerBase): def db_partition_indices(self, partition_size): """Describe an index-space compartmentalization of the db table - in equal sized chunks. This is used to describe min&max bounds for - parallelizing fetch tasks. + in equal sized chunks. This is used to describe min&max bounds for + parallelizing fetch tasks. Args: partition_size (int): desired size to make each partition + Returns: a list of tuples (begin, end) of indexable value that - declare approximately equal-sized ranges of existing - repos - """ + declare approximately equal-sized ranges of existing + repos + """ n = max(self.db_num_entries(), 10) partition_size = min(partition_size, n) n_partitions = n // partition_size @@ -114,7 +116,8 @@ class IndexingLister(ListerBase): max_index = self.db_last_index() if not min_index or not max_index: - raise ValueError("Can't partition an empty range") + # Nothing to list + return [] if isinstance(min_index, str): def format_bound(bound): @@ -201,7 +204,7 @@ class IndexingLister(ListerBase): self.max_index = max_bound def ingest_indexes(): - index = min_bound or '' + index = min_bound or self.default_min_bound for i in count(1): response, injected_repos = self.ingest_data(index) if not response and not injected_repos: diff --git a/swh/lister/github/tasks.py b/swh/lister/github/tasks.py index 7d91dc9..bc3f8c2 100644 --- a/swh/lister/github/tasks.py +++ b/swh/lister/github/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -30,8 +30,16 @@ def range_github_lister(start, end, **lister_args): @app.task(name=__name__ + '.FullGitHubRelister', bind=True) def full_github_relister(self, split=None, **lister_args): + """Relist from the beginning of what's already been listed. + + It's not to be called for an initial listing. + + """ lister = new_lister(**lister_args) ranges = lister.db_partition_indices(split or GROUP_SPLIT) + if not ranges: + self.log.info('Nothing to list') + return random.shuffle(ranges) promise = group(range_github_lister.s(minv, maxv, **lister_args) for minv, maxv in ranges)() diff --git a/swh/lister/gitlab/tasks.py b/swh/lister/gitlab/tasks.py index aee2c19..eff3114 100644 --- a/swh/lister/gitlab/tasks.py +++ b/swh/lister/gitlab/tasks.py @@ -38,6 +38,11 @@ def range_gitlab_lister(start, end, **lister_args): @app.task(name=__name__ + '.FullGitLabRelister', bind=True) def full_gitlab_relister(self, **lister_args): + """Full lister + + This should be renamed as such. + + """ lister = new_lister(**lister_args) _, total_pages, _ = lister.get_pages_information() ranges = list(utils.split_range(total_pages, NBPAGES)) diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index 579bd31..9b8059e 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -31,6 +31,14 @@ class PhabricatorLister(IndexingHttpLister): super().__init__(api_baseurl=api_baseurl, override_config=override_config) + @property + def default_min_bound(self): + """Starting boundary when `min_bound` is not defined (db empty). This + is used within the fn:`run` call. + + """ + return self._bootstrap_repositories_listing() + def _build_query_params(self, params, api_token): """Build query params to include the forge's api token @@ -134,21 +142,6 @@ class PhabricatorLister(IndexingHttpLister): self.schedule_missing_tasks(models_list, injected) return self.max_index - def run(self, min_bound=None, max_bound=None): - """ - (Override) Run the lister on the specified Phabricator instance - - Args: - min_bound (int): Optional repository index to start the listing - after it - max_bound (int): Optional repository index to stop the listing - after it - """ - # initial call to the lister, we need to bootstrap it in that case - if min_bound is None: - min_bound = self._bootstrap_repositories_listing() - super().run(min_bound, max_bound) - def get_repo_url(attachments): """