From 4f57e844506205182ce4fc7029401cd80a357547 Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Thu, 13 Apr 2023 14:19:39 +0200 Subject: [PATCH] Use http_retry decorator from swh.core.retry module The http_retry decorator has been moved to swh-core package in order to ease its reuse across swh packages. --- docs/tutorial.rst | 2 +- requirements-swh.txt | 2 +- swh/lister/bitbucket/tests/test_lister.py | 3 +- swh/lister/gitlab/lister.py | 4 +- swh/lister/gitlab/tests/test_lister.py | 4 +- swh/lister/golang/tests/test_lister.py | 7 +- swh/lister/launchpad/lister.py | 4 +- swh/lister/pattern.py | 6 +- swh/lister/pypi/lister.py | 4 +- swh/lister/sourceforge/tests/test_lister.py | 7 +- swh/lister/tests/test_utils.py | 94 +-------------------- swh/lister/tests/utils.py | 8 ++ swh/lister/utils.py | 88 +------------------ 13 files changed, 32 insertions(+), 201 deletions(-) create mode 100644 swh/lister/tests/utils.py diff --git a/docs/tutorial.rst b/docs/tutorial.rst index c01195e..eafafd9 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -318,7 +318,7 @@ then immediately stop the listing by doing an equivalent of :py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from -:py:mod:`swh.lister.utils`. +:py:mod:`swh.core.retry`. Pagination ^^^^^^^^^^ diff --git a/requirements-swh.txt b/requirements-swh.txt index 7c34143..6bf26fe 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ -swh.core[db,github] >= 2.16.1 +swh.core[db,github] >= 2.22.0 swh.scheduler >= 0.8 diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py index 7ca2b0b..7a78ae2 100644 --- a/swh/lister/bitbucket/tests/test_lister.py +++ b/swh/lister/bitbucket/tests/test_lister.py @@ -9,8 +9,8 @@ import os import pytest +from swh.core.retry import MAX_NUMBER_ATTEMPTS from swh.lister.bitbucket.lister import BitbucketLister -from swh.lister.utils import MAX_NUMBER_ATTEMPTS @pytest.fixture @@ -188,7 +188,6 @@ def test_bitbucket_lister_buggy_page( bb_api_repositories_page1, bb_api_repositories_page2, ): - requests_mock.get( BitbucketLister.API_URL, [ diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 3ad2bfd..399b840 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2022 The Software Heritage developers +# Copyright (C) 2018-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,8 +14,8 @@ from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log +from swh.core.retry import http_retry, is_retryable_exception from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py index d1fd797..dc99402 100644 --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -12,11 +12,11 @@ from typing import Dict, List import pytest from requests.status_codes import codes +from swh.core.retry import WAIT_EXP_BASE from swh.lister import USER_AGENT_TEMPLATE from swh.lister.gitlab.lister import GitLabLister, _parse_id_after from swh.lister.pattern import ListerStats -from swh.lister.tests.test_utils import assert_sleep_calls -from swh.lister.utils import WAIT_EXP_BASE +from swh.lister.tests.utils import assert_sleep_calls logger = logging.getLogger(__name__) diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py index e1abe90..94cea57 100644 --- a/swh/lister/golang/tests/test_lister.py +++ b/swh/lister/golang/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,9 +8,9 @@ from pathlib import Path import iso8601 +from swh.core.retry import WAIT_EXP_BASE from swh.lister.golang.lister import GolangLister, GolangStateType -from swh.lister.tests.test_utils import assert_sleep_calls -from swh.lister.utils import WAIT_EXP_BASE +from swh.lister.tests.utils import assert_sleep_calls # https://pkg.go.dev prefix omitted expected_listed = [ @@ -98,7 +98,6 @@ def _generate_responses(datadir, requests_mock): def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir): - # Exponential retries take a long time, so stub time.sleep mocked_sleep = mocker.patch.object(GolangLister.http_request.retry, "sleep") diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index b9daa18..987154c 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,7 +14,7 @@ from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import http_retry, retry_if_exception +from swh.core.retry import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 014fdf3..31dc4c6 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -16,12 +16,13 @@ from tenacity.before_sleep import before_sleep_log from swh.core.config import load_from_envvar from swh.core.github.utils import GitHubSession +from swh.core.retry import http_retry from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface from . import USER_AGENT_TEMPLATE -from .utils import http_retry, is_valid_origin_url +from .utils import is_valid_origin_url logger = logging.getLogger(__name__) @@ -153,7 +154,6 @@ class Lister(Generic[StateType, PageType]): @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def http_request(self, url: str, method="GET", **kwargs) -> requests.Response: - logger.debug("Fetching URL %s with params %s", url, kwargs.get("params")) response = self.session.request(method, url, **kwargs) diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 64f14fa..f5141c1 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -13,7 +13,7 @@ from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import http_retry +from swh.core.retry import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py index 1a97bf3..4261ce5 100644 --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021-2022 The Software Heritage developers +# Copyright (C) 2021-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -13,6 +13,7 @@ from iso8601 import iso8601 import pytest from requests.exceptions import HTTPError +from swh.core.retry import WAIT_EXP_BASE from swh.lister import USER_AGENT_TEMPLATE from swh.lister.sourceforge.lister import ( MAIN_SITEMAP_URL, @@ -20,8 +21,7 @@ from swh.lister.sourceforge.lister import ( SourceForgeLister, SourceForgeListerState, ) -from swh.lister.tests.test_utils import assert_sleep_calls -from swh.lister.utils import WAIT_EXP_BASE +from swh.lister.tests.utils import assert_sleep_calls # Mapping of project name to namespace from swh.scheduler.model import ListedOrigin @@ -368,7 +368,6 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir): - lister = SourceForgeLister(scheduler=swh_scheduler) # Exponential retries take a long time, so stub time.sleep diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index 98b376f..e0cc672 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,13 +1,10 @@ -# Copyright (C) 2018-2022 the Software Heritage developers +# Copyright (C) 2018-2023 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest -import requests -from requests.status_codes import codes -from tenacity.wait import wait_fixed -from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range +from swh.lister.utils import split_range @pytest.mark.parametrize( @@ -41,90 +38,3 @@ def test_split_range_errors(total_pages, nb_pages): for total_pages, nb_pages in [(None, 1), (100, None)]: with pytest.raises(TypeError): next(split_range(total_pages, nb_pages)) - - -TEST_URL = "https://example.og/api/repositories" - - -@http_retry() -def make_request(): - response = requests.get(TEST_URL) - response.raise_for_status() - return response - - -def assert_sleep_calls(mocker, mock_sleep, sleep_params): - mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) - - -@pytest.mark.parametrize( - "status_code", - [ - codes.too_many_requests, - codes.internal_server_error, - codes.bad_gateway, - codes.service_unavailable, - ], -) -def test_http_retry(requests_mock, mocker, status_code): - data = {"result": {}} - requests_mock.get( - TEST_URL, - [ - {"status_code": status_code}, - {"status_code": status_code}, - {"status_code": codes.ok, "json": data}, - ], - ) - - mock_sleep = mocker.patch.object(make_request.retry, "sleep") - - response = make_request() - - assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE]) - - assert response.json() == data - - -def test_http_retry_max_attemps(requests_mock, mocker): - requests_mock.get( - TEST_URL, - [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), - ) - - mock_sleep = mocker.patch.object(make_request.retry, "sleep") - - with pytest.raises(requests.exceptions.HTTPError) as e: - make_request() - - assert e.value.response.status_code == codes.too_many_requests - - assert_sleep_calls( - mocker, - mock_sleep, - [float(WAIT_EXP_BASE**i) for i in range(MAX_NUMBER_ATTEMPTS - 1)], - ) - - -@http_retry(wait=wait_fixed(WAIT_EXP_BASE)) -def make_request_wait_fixed(): - response = requests.get(TEST_URL) - response.raise_for_status() - return response - - -def test_http_retry_wait_fixed(requests_mock, mocker): - requests_mock.get( - TEST_URL, - [ - {"status_code": codes.too_many_requests}, - {"status_code": codes.too_many_requests}, - {"status_code": codes.ok}, - ], - ) - - mock_sleep = mocker.patch.object(make_request_wait_fixed.retry, "sleep") - - make_request_wait_fixed() - - assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2) diff --git a/swh/lister/tests/utils.py b/swh/lister/tests/utils.py new file mode 100644 index 0000000..d4e15c4 --- /dev/null +++ b/swh/lister/tests/utils.py @@ -0,0 +1,8 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def assert_sleep_calls(mocker, mock_sleep, sleep_params): + mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) diff --git a/swh/lister/utils.py b/swh/lister/utils.py index 3220d4d..60cfc93 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,16 +1,10 @@ -# Copyright (C) 2018-2022 the Software Heritage developers +# Copyright (C) 2018-2023 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Callable, Iterator, Optional, Tuple +from typing import Iterator, Optional, Tuple import urllib.parse -from requests.exceptions import ConnectionError, HTTPError -from requests.status_codes import codes -from tenacity import retry as tenacity_retry -from tenacity.stop import stop_after_attempt -from tenacity.wait import wait_exponential - def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can @@ -36,84 +30,6 @@ def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]: yield index, total_pages -def is_throttling_exception(e: Exception) -> bool: - """ - Checks if an exception is a requests.exception.HTTPError for - a response with status code 429 (too many requests). - """ - return ( - isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests - ) - - -def is_retryable_exception(e: Exception) -> bool: - """ - Checks if an exception is worth retrying (connection, throttling or a server error). - """ - is_connection_error = isinstance(e, ConnectionError) - is_500_error = isinstance(e, HTTPError) and e.response.status_code >= 500 - - return is_connection_error or is_throttling_exception(e) or is_500_error - - -def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> bool: - """ - Custom tenacity retry predicate for handling exceptions with the given predicate. - """ - attempt = retry_state.outcome - if attempt.failed: - exception = attempt.exception() - return predicate(exception) - return False - - -def retry_policy_generic(retry_state) -> bool: - """ - Custom tenacity retry predicate for handling failed requests: - - ConnectionError - - Server errors (status >= 500) - - Throttling errors (status == 429) - - This does not handle 404, 403 or other status codes. - """ - return retry_if_exception(retry_state, is_retryable_exception) - - -WAIT_EXP_BASE = 10 -MAX_NUMBER_ATTEMPTS = 5 - - -def http_retry( - retry=retry_policy_generic, - wait=wait_exponential(exp_base=WAIT_EXP_BASE), - stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), - **retry_args, -): - """ - Decorator based on `tenacity` for retrying a function possibly raising - requests.exception.HTTPError for status code 429 (too many requests). - - It provides a default configuration that should work properly in most - cases but all `tenacity.retry` parameters can also be overridden in client - code. - - When the mmaximum of attempts is reached, the HTTPError exception will then - be reraised. - - Args: - retry: function defining request retry condition (default to 429 status code) - https://tenacity.readthedocs.io/en/latest/#whether-to-retry - - wait: function defining wait strategy before retrying (default to exponential - backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying - - stop: function defining when to stop retrying (default after 5 attempts) - https://tenacity.readthedocs.io/en/latest/#stopping - - """ - return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args) - - def is_valid_origin_url(url: Optional[str]) -> bool: """Returns whether the given string is a valid origin URL. This excludes Git SSH URLs and pseudo-URLs (eg. ``ssh://git@example.org:foo``