utils.split_range: Split into non-overlapping ranges

Existing listers use the `is_within_bound` [1] method from the base lister.
This method uses inclusive boundaries in all cases.

As some "range" task listers [2] [3] use the `split_range` function, which
creates "overlapping" ranges (consecutive ranges share a boundary value), this
can cause concurrent insert issues down the line [4].

This commit adapts the `split_range` function so that the generated ranges no
longer overlap.

[1]
https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/core/lister_base.py$194-199

[2]
https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/tasks.py$37-41

[3]
https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitea/tasks.py$36-41

Related to T2577
This commit is contained in:
Antoine R. Dumont (@ardumont) 2020-09-09 18:50:46 +02:00
parent 725c1fe4ad
commit e3c856b5ee
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
4 changed files with 54 additions and 44 deletions

View file

@ -4,9 +4,12 @@
# See top-level LICENSE file for more information
from time import sleep
from celery.result import GroupResult
from unittest.mock import patch
from celery.result import GroupResult
from unittest.mock import patch, call
from swh.lister.gitea.tasks import NBPAGES
from swh.lister.utils import split_range
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@ -57,13 +60,11 @@ def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_relister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
total_pages = 85
# setup the mocked GiteaLister
lister.return_value = lister
lister.run.return_value = None
lister.get_pages_information.return_value = (None, 85, None)
lister.db_partition_indices.return_value = [
(i, i + 9) for i in range(0, 80, 10)
] + [(80, 85)]
lister.get_pages_information.return_value = (None, total_pages, None)
res = swh_scheduler_celery_app.send_task("swh.lister.gitea.tasks.FullGiteaRelister")
assert res
@ -92,25 +93,21 @@ def test_relister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker)
lister.get_pages_information.assert_called_once_with()
# lister.run should have been called once per partition interval
for i in range(8):
# XXX inconsistent behavior: max_bound is EXCLUDED here
for min_bound, max_bound in split_range(total_pages, NBPAGES):
assert (
dict(min_bound=10 * i, max_bound=10 * i + 10),
) in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list
call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list
)
@patch("swh.lister.gitea.tasks.GiteaLister")
def test_relister_instance(
lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
total_pages = 85
# setup the mocked GiteaLister
lister.return_value = lister
lister.run.return_value = None
lister.get_pages_information.return_value = (None, 85, None)
lister.db_partition_indices.return_value = [
(i, i + 9) for i in range(0, 80, 10)
] + [(80, 85)]
lister.get_pages_information.return_value = (None, total_pages, None)
res = swh_scheduler_celery_app.send_task(
"swh.lister.gitea.tasks.FullGiteaRelister",
@ -142,9 +139,7 @@ def test_relister_instance(
lister.get_pages_information.assert_called_once_with()
# lister.run should have been called once per partition interval
for i in range(8):
# XXX inconsistent behavior: max_bound is EXCLUDED here
for min_bound, max_bound in split_range(total_pages, NBPAGES):
assert (
dict(min_bound=10 * i, max_bound=10 * i + 10),
) in lister.run.call_args_list
assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list
call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list
)