diff --git a/swh/lister/gitea/tests/test_tasks.py b/swh/lister/gitea/tests/test_tasks.py index 5c070b5..c780510 100644 --- a/swh/lister/gitea/tests/test_tasks.py +++ b/swh/lister/gitea/tests/test_tasks.py @@ -1,150 +1,145 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from time import sleep + from celery.result import GroupResult +from unittest.mock import patch, call -from unittest.mock import patch +from swh.lister.gitea.tasks import NBPAGES +from swh.lister.utils import split_range def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.gitea.tasks.ping") assert res res.wait() assert res.successful() assert res.result == "OK" @patch("swh.lister.gitea.tasks.GiteaLister") def test_incremental(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked GiteaLister lister.return_value = lister lister.run.return_value = None lister.get_pages_information.return_value = (None, 10, None) res = swh_scheduler_celery_app.send_task( "swh.lister.gitea.tasks.IncrementalGiteaLister" ) assert res res.wait() assert res.successful() lister.assert_called_once_with(order="desc") lister.db_last_index.assert_not_called() lister.get_pages_information.assert_called_once_with() lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True) @patch("swh.lister.gitea.tasks.GiteaLister") def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked GiteaLister lister.return_value = lister lister.run.return_value = None res = swh_scheduler_celery_app.send_task( "swh.lister.gitea.tasks.RangeGiteaLister", kwargs=dict(start=12, end=42) ) assert res res.wait() assert res.successful() lister.assert_called_once_with() 
lister.db_last_index.assert_not_called() lister.run.assert_called_once_with(min_bound=12, max_bound=42) @patch("swh.lister.gitea.tasks.GiteaLister") def test_relister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + total_pages = 85 # setup the mocked GiteaLister lister.return_value = lister lister.run.return_value = None - lister.get_pages_information.return_value = (None, 85, None) - lister.db_partition_indices.return_value = [ - (i, i + 9) for i in range(0, 80, 10) - ] + [(80, 85)] + lister.get_pages_information.return_value = (None, total_pages, None) res = swh_scheduler_celery_app.send_task("swh.lister.gitea.tasks.FullGiteaRelister") assert res res.wait() assert res.successful() # retrieve the GroupResult for this task and wait for all the subtasks # to complete promise_id = res.result assert promise_id promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) for i in range(5): if promise.ready(): break sleep(1) lister.assert_called_with() # one by the FullGiteaRelister task # + 9 for the RangeGiteaLister subtasks assert lister.call_count == 10 lister.db_last_index.assert_not_called() lister.db_partition_indices.assert_not_called() lister.get_pages_information.assert_called_once_with() # lister.run should have been called once per partition interval - for i in range(8): - # XXX inconsistent behavior: max_bound is EXCLUDED here + for min_bound, max_bound in split_range(total_pages, NBPAGES): assert ( - dict(min_bound=10 * i, max_bound=10 * i + 10), - ) in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list + call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list + ) @patch("swh.lister.gitea.tasks.GiteaLister") def test_relister_instance( lister, swh_scheduler_celery_app, swh_scheduler_celery_worker ): + total_pages = 85 # setup the mocked GiteaLister lister.return_value = lister lister.run.return_value = None - lister.get_pages_information.return_value = (None, 
85, None) - lister.db_partition_indices.return_value = [ - (i, i + 9) for i in range(0, 80, 10) - ] + [(80, 85)] + lister.get_pages_information.return_value = (None, total_pages, None) res = swh_scheduler_celery_app.send_task( "swh.lister.gitea.tasks.FullGiteaRelister", kwargs=dict(url="https://0xacab.org/api/v4"), ) assert res res.wait() assert res.successful() # retrieve the GroupResult for this task and wait for all the subtasks # to complete promise_id = res.result assert promise_id promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) for i in range(5): if promise.ready(): break sleep(1) lister.assert_called_with(url="https://0xacab.org/api/v4") # one by the FullGiteaRelister task # + 9 for the RangeGiteaLister subtasks assert lister.call_count == 10 lister.db_last_index.assert_not_called() lister.db_partition_indices.assert_not_called() lister.get_pages_information.assert_called_once_with() # lister.run should have been called once per partition interval - for i in range(8): - # XXX inconsistent behavior: max_bound is EXCLUDED here + for min_bound, max_bound in split_range(total_pages, NBPAGES): assert ( - dict(min_bound=10 * i, max_bound=10 * i + 10), - ) in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list + call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list + ) diff --git a/swh/lister/gitlab/tests/test_tasks.py b/swh/lister/gitlab/tests/test_tasks.py index 1e7f27e..466da99 100644 --- a/swh/lister/gitlab/tests/test_tasks.py +++ b/swh/lister/gitlab/tests/test_tasks.py @@ -1,152 +1,147 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from time import sleep + from celery.result import GroupResult +from unittest.mock import patch, call -from unittest.mock import patch 
+from swh.lister.gitlab.tasks import NBPAGES +from swh.lister.utils import split_range def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.gitlab.tasks.ping") assert res res.wait() assert res.successful() assert res.result == "OK" @patch("swh.lister.gitlab.tasks.GitLabLister") def test_incremental(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None lister.get_pages_information.return_value = (None, 10, None) res = swh_scheduler_celery_app.send_task( "swh.lister.gitlab.tasks.IncrementalGitLabLister" ) assert res res.wait() assert res.successful() lister.assert_called_once_with(sort="desc") lister.db_last_index.assert_not_called() lister.get_pages_information.assert_called_once_with() lister.run.assert_called_once_with(min_bound=1, max_bound=10, check_existence=True) @patch("swh.lister.gitlab.tasks.GitLabLister") def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None res = swh_scheduler_celery_app.send_task( "swh.lister.gitlab.tasks.RangeGitLabLister", kwargs=dict(start=12, end=42) ) assert res res.wait() assert res.successful() lister.assert_called_once_with() lister.db_last_index.assert_not_called() lister.run.assert_called_once_with(min_bound=12, max_bound=42) @patch("swh.lister.gitlab.tasks.GitLabLister") def test_relister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + total_pages = 85 # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None - lister.get_pages_information.return_value = (None, 85, None) - lister.db_partition_indices.return_value = [ - (i, i + 9) for i in range(0, 80, 10) - ] + [(80, 85)] + lister.get_pages_information.return_value = (None, total_pages, None) res = swh_scheduler_celery_app.send_task(
"swh.lister.gitlab.tasks.FullGitLabRelister" ) assert res res.wait() assert res.successful() # retrieve the GroupResult for this task and wait for all the subtasks # to complete promise_id = res.result assert promise_id promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) for i in range(5): if promise.ready(): break sleep(1) lister.assert_called_with() # one by the FullGitlabRelister task # + 9 for the RangeGitlabLister subtasks assert lister.call_count == 10 lister.db_last_index.assert_not_called() lister.db_partition_indices.assert_not_called() lister.get_pages_information.assert_called_once_with() # lister.run should have been called once per partition interval - for i in range(8): - # XXX inconsistent behavior: max_bound is EXCLUDED here + for min_bound, max_bound in split_range(total_pages, NBPAGES): assert ( - dict(min_bound=10 * i, max_bound=10 * i + 10), - ) in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list + call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list + ) @patch("swh.lister.gitlab.tasks.GitLabLister") def test_relister_instance( lister, swh_scheduler_celery_app, swh_scheduler_celery_worker ): + total_pages = 85 # setup the mocked GitlabLister lister.return_value = lister lister.run.return_value = None - lister.get_pages_information.return_value = (None, 85, None) - lister.db_partition_indices.return_value = [ - (i, i + 9) for i in range(0, 80, 10) - ] + [(80, 85)] + lister.get_pages_information.return_value = (None, total_pages, None) res = swh_scheduler_celery_app.send_task( "swh.lister.gitlab.tasks.FullGitLabRelister", kwargs=dict(url="https://0xacab.org/api/v4"), ) assert res res.wait() assert res.successful() # retrieve the GroupResult for this task and wait for all the subtasks # to complete promise_id = res.result assert promise_id promise = GroupResult.restore(promise_id, app=swh_scheduler_celery_app) for i in range(5): if promise.ready(): break 
sleep(1) lister.assert_called_with(url="https://0xacab.org/api/v4") # one by the FullGitlabRelister task # + 9 for the RangeGitlabLister subtasks assert lister.call_count == 10 lister.db_last_index.assert_not_called() lister.db_partition_indices.assert_not_called() lister.get_pages_information.assert_called_once_with() # lister.run should have been called once per partition interval - for i in range(8): - # XXX inconsistent behavior: max_bound is EXCLUDED here + for min_bound, max_bound in split_range(total_pages, NBPAGES): assert ( - dict(min_bound=10 * i, max_bound=10 * i + 10), - ) in lister.run.call_args_list - assert (dict(min_bound=80, max_bound=85),) in lister.run.call_args_list + call(min_bound=min_bound, max_bound=max_bound) in lister.run.call_args_list + ) diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index f793b4c..defde1a 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,37 +1,42 @@ # Copyright (C) 2018-2020 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from testing.postgresql import Postgresql from swh.lister import utils @pytest.mark.parametrize( "total_pages,nb_pages,expected_ranges", - [(14, 5, [(0, 5), (5, 10), (10, 14)]), (19, 10, [(0, 10), (10, 19)])], + [ + (14, 5, [(0, 4), (5, 9), (10, 14)]), + (19, 10, [(0, 9), (10, 19)]), + (20, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)]), + (21, 3, [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21),],), + ], ) def test_split_range(total_pages, nb_pages, expected_ranges): actual_ranges = list(utils.split_range(total_pages, nb_pages)) assert actual_ranges == expected_ranges @pytest.mark.parametrize("total_pages,nb_pages", [(None, 1), (100, None)]) def test_split_range_errors(total_pages, nb_pages): for total_pages, nb_pages in [(None, 1), (100, None)]: with pytest.raises(TypeError): 
next(utils.split_range(total_pages, nb_pages)) def init_db(): """Factorize the db_url instantiation Returns: db object to ease db manipulation """ initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"] initdb_args = " ".join([initdb_args, "-E UTF-8"]) return Postgresql(initdb_args=initdb_args) diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 68e8b82..3576608 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,14 +1,29 @@ -# Copyright (C) 2018 the Software Heritage developers +# Copyright (C) 2018-2020 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Iterator, Tuple -def split_range(total_pages, nb_pages): + +def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: + """Split `total_pages` into inclusive ranges of mostly `nb_pages` elements. The last + range can be shorter or hold one extra element. + + >>> list(split_range(19, 10)) + [(0, 9), (10, 19)] + + >>> list(split_range(20, 3)) + [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 20)] + + >>> list(split_range(21, 3)) + [(0, 2), (3, 5), (6, 8), (9, 11), (12, 14), (15, 17), (18, 21)] + + """ prev_index = None for index in range(0, total_pages, nb_pages): if index is not None and prev_index is not None: - yield prev_index, index + yield prev_index, index - 1 prev_index = index if index != total_pages: yield index, total_pages