From 9c55acd286091acb6f6094e9fe1c95aca1fdeeec Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Wed, 21 Sep 2022 16:56:34 +0200 Subject: [PATCH] Use generic HTTP retry policy by default and rename dedicated decorator Instead of retrying HTTP requests only for 429 status code by default, prefer to use the generic retry policy enabling to also retry for status codes >= 500 but also on ConnectionError exceptions. Rename throttling_retry decorator to http_retry to reflect this change. --- docs/new_lister_template.py | 6 ++--- docs/tutorial.rst | 2 +- swh/lister/arch/lister.py | 4 ++-- swh/lister/bitbucket/lister.py | 4 ++-- swh/lister/bower/lister.py | 4 ++-- swh/lister/cgit/lister.py | 4 ++-- swh/lister/gitea/tests/test_lister.py | 2 +- swh/lister/gitlab/lister.py | 4 ++-- swh/lister/gogs/lister.py | 4 ++-- swh/lister/gogs/tests/test_lister.py | 2 +- swh/lister/golang/lister.py | 5 ++-- swh/lister/launchpad/lister.py | 4 ++-- swh/lister/maven/lister.py | 4 ++-- swh/lister/maven/tests/test_lister.py | 5 ++++ swh/lister/npm/lister.py | 4 ++-- swh/lister/npm/tests/test_lister.py | 5 ++++ swh/lister/pubdev/lister.py | 4 ++-- swh/lister/pypi/lister.py | 6 ++--- swh/lister/sourceforge/lister.py | 5 ++-- swh/lister/tests/test_utils.py | 32 +++++++++++++++----------- swh/lister/tuleap/lister.py | 4 ++-- swh/lister/tuleap/tests/test_lister.py | 5 ++++ swh/lister/utils.py | 14 +++-------- 23 files changed, 71 insertions(+), 62 deletions(-) diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py index 20e3e90..41e27a7 100644 --- a/docs/new_lister_template.py +++ b/docs/new_lister_template.py @@ -11,7 +11,7 @@ from urllib.parse import urljoin import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -77,11 +77,11 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]): def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url, params) -> requests.Response: # Do the network resource request under a retrying decorator # to handle rate limiting and transient errors up to a limit. - # `throttling_retry` by default use the `requests` library to check + # `http_retry` by default use the `requests` library to check # only for rate-limit and a base-10 exponential waiting strategy. # This can be customized by passed waiting, retrying and logging strategies # as functions. See the `tenacity` library documentation. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d4ae380..c01195e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -317,7 +317,7 @@ We generally recommend logging every unhandleable error with the response conten then immediately stop the listing by doing an equivalent of :py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting errors, we have a strategy of using a flexible decorator to handle the retrying for us. -It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from +It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from :py:mod:`swh.lister.utils`. Pagination diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index af3a3d8..a933650 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -16,7 +16,7 @@ from bs4 import BeautifulSoup import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -132,7 +132,7 @@ class ArchLister(StatelessLister[ArchListerPage]): } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 6a99699..1c195b2 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -14,7 +14,7 @@ import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -107,7 +107,7 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): if username is not None and password is not None: self.session.auth = (username, password) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) + @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def page_request(self, last_repo_cdate: str) -> requests.Response: self.url_params["after"] = last_repo_cdate diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index f516b2b..64921e2 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -8,7 +8,7 @@ from typing import Any, Dict, Iterator, List, Optional import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -49,7 +49,7 @@ class BowerLister(StatelessLister[BowerListerPage]): } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 5ca9445..6fcfb54 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -15,7 +15,7 @@ from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -79,7 +79,7 @@ class CGitLister(StatelessLister[Repositories]): ) self.base_git_url = base_git_url - @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) + @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" response = self.session.get(url) diff --git a/swh/lister/gitea/tests/test_lister.py b/swh/lister/gitea/tests/test_lister.py index 8e3242b..0cf59ad 100644 --- a/swh/lister/gitea/tests/test_lister.py +++ b/swh/lister/gitea/tests/test_lister.py @@ -138,7 +138,7 @@ def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1): assert stats.pages == 1 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gitea_list_http_error( swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2 ): diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 61006b0..c878788 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -17,7 +17,7 @@ from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import is_retryable_exception, throttling_retry +from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]): def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def get_page_result(self, url: str) -> PageResult: diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py index 16d9626..17ec17f 100644 --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -12,7 +12,7 @@ import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -116,7 +116,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]): def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request( self, url: str, params: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py index bcac533..dfcd991 100644 --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -177,7 +177,7 @@ def test_gogs_auth_instance( assert stats.origins == 6 -@pytest.mark.parametrize("http_code", [400, 500, 502]) +@pytest.mark.parametrize("http_code", [400, 500]) def test_gogs_list_http_error( swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last ): diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 0a2f141..87ea9f8 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -13,7 +13,7 @@ import iso8601 import requests from tenacity import before_sleep_log -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -87,8 +87,7 @@ class GolangLister(Lister[GolangStateType, GolangPageType]): ): self.updated = True - @throttling_retry( - retry=retry_policy_generic, + @http_retry( before_sleep=before_sleep_log(logger, logging.WARNING), ) def api_request(self, url: str) -> List[str]: diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index b134303..e9c36fa 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -14,7 +14,7 @@ from lazr.restfulclient.errors import RestfulError from lazr.restfulclient.resource import Collection from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import retry_if_exception, throttling_retry +from swh.lister.utils import http_retry, retry_if_exception from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -100,7 +100,7 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): d[attribute_name] = date_last_modified.isoformat() return d - @throttling_retry( + @http_retry( retry=retry_if_restful_error, before_sleep=before_sleep_log(logger, logging.WARNING), ) diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 2560feb..563c0a7 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -16,7 +16,7 @@ import requests from tenacity.before_sleep import before_sleep_log from swh.core.github.utils import GitHubSession -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -112,7 +112,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]): def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py index 6a75a99..505afa3 100644 --- a/swh/lister/maven/tests/test_lister.py +++ b/swh/lister/maven/tests/test_lister.py @@ -125,6 +125,11 @@ def network_requests_mock( requests_mock.get(URL_POM_3, content=maven_pom_3) +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(MavenLister.page_request.retry, "sleep") + + def test_maven_full_listing(swh_scheduler): """Covers full listing of multiple pages, checking page results and listed origins, statelessness.""" diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index dfc6561..8d48873 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -12,7 +12,7 @@ from tenacity.before_sleep import before_sleep_log from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -95,7 +95,7 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): params["startkey"] = last_package_id return params - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, last_package_id: str) -> requests.Response: params = self.request_params(last_package_id) logger.debug("Fetching URL %s with params %s", self.url, params) diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py index 1c20b33..2d41b59 100644 --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -35,6 +35,11 @@ def npm_incremental_listing_page2(datadir): return json.loads(Path(datadir, "npm_incremental_page2.json").read_text()) +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(NpmLister.page_request.retry, "sleep") + + def _check_listed_npm_packages(lister, packages, scheduler_origins): for package in packages: package_name = package["doc"]["name"] diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index a17ad0e..77cb4de 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -10,7 +10,7 @@ import requests from requests.exceptions import HTTPError from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,7 +60,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]): } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.debug("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index eefd797..443c21d 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -13,7 +13,7 @@ from xmlrpc.client import Fault, ServerProxy from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -88,7 +88,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]: return asdict(state) - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_last_serial(self, client: ServerProxy) -> int: @@ -97,7 +97,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): assert isinstance(serial, int) return serial - @throttling_retry( + @http_retry( retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING) ) def _changelog_since_serial( diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index dcc30c3..8bc56ee 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -18,7 +18,7 @@ import requests from tenacity.before_sleep import before_sleep_log from swh.core.api.classes import stream_results -from swh.lister.utils import retry_policy_generic, throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -208,8 +208,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): self._project_last_modified = listed_origins return listed_origins - @throttling_retry( - retry=retry_policy_generic, + @http_retry( before_sleep=before_sleep_log(logger, logging.WARNING), ) def page_request(self, url, params) -> requests.Response: diff --git a/swh/lister/tests/test_utils.py b/swh/lister/tests/test_utils.py index 6d9b50d..98b376f 100644 --- a/swh/lister/tests/test_utils.py +++ b/swh/lister/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,12 +7,7 @@ import requests from requests.status_codes import codes from tenacity.wait import wait_fixed -from swh.lister.utils import ( - MAX_NUMBER_ATTEMPTS, - WAIT_EXP_BASE, - split_range, - throttling_retry, -) +from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range @pytest.mark.parametrize( @@ -51,7 +46,7 @@ def test_split_range_errors(total_pages, nb_pages): TEST_URL = "https://example.og/api/repositories" -@throttling_retry() +@http_retry() def make_request(): response = requests.get(TEST_URL) response.raise_for_status() @@ -62,13 +57,22 @@ def assert_sleep_calls(mocker, mock_sleep, sleep_params): mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params]) -def test_throttling_retry(requests_mock, mocker): +@pytest.mark.parametrize( + "status_code", + [ + codes.too_many_requests, + codes.internal_server_error, + codes.bad_gateway, + codes.service_unavailable, + ], +) +def test_http_retry(requests_mock, mocker, status_code): data = {"result": {}} requests_mock.get( TEST_URL, [ - {"status_code": codes.too_many_requests}, - {"status_code": codes.too_many_requests}, + {"status_code": status_code}, + {"status_code": status_code}, {"status_code": codes.ok, "json": data}, ], ) @@ -82,7 +86,7 @@ def test_throttling_retry(requests_mock, mocker): assert response.json() == data -def test_throttling_retry_max_attemps(requests_mock, mocker): +def test_http_retry_max_attemps(requests_mock, mocker): requests_mock.get( TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS), @@ -102,14 +106,14 @@ def test_throttling_retry_max_attemps(requests_mock, mocker): ) -@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE)) +@http_retry(wait=wait_fixed(WAIT_EXP_BASE)) def make_request_wait_fixed(): response = requests.get(TEST_URL) response.raise_for_status() return response -def test_throttling_retry_wait_fixed(requests_mock, mocker): +def test_http_retry_wait_fixed(requests_mock, mocker): requests_mock.get( TEST_URL, [ diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 179329a..7e0b800 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -11,7 +11,7 @@ import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from swh.lister.utils import throttling_retry +from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -65,7 +65,7 @@ class TuleapLister(StatelessLister[RepoPage]): } ) - @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response: logger.info("Fetching URL %s with params %s", url, params) diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py index 16d0c7a..d650dc8 100644 --- a/swh/lister/tuleap/tests/test_lister.py +++ b/swh/lister/tuleap/tests/test_lister.py @@ -92,6 +92,11 @@ def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[st return text, headers, page_results, origin_urls +@pytest.fixture(autouse=True) +def retry_sleep_mock(mocker): + mocker.patch.object(TuleapLister.page_request.retry, "sleep") + + def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): """Asserts that the two collections have the same origin URLs. diff --git a/swh/lister/utils.py b/swh/lister/utils.py index ea4a989..125b31b 100644 --- a/swh/lister/utils.py +++ b/swh/lister/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 the Software Heritage developers +# Copyright (C) 2018-2022 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -66,14 +66,6 @@ def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> b return False -def retry_if_throttling(retry_state) -> bool: - """ - Custom tenacity retry predicate for handling HTTP responses with - status code 429 (too many requests). - """ - return retry_if_exception(retry_state, is_throttling_exception) - - def retry_policy_generic(retry_state) -> bool: """ Custom tenacity retry predicate for handling failed requests: @@ -90,8 +82,8 @@ WAIT_EXP_BASE = 10 MAX_NUMBER_ATTEMPTS = 5 -def throttling_retry( - retry=retry_if_throttling, +def http_retry( + retry=retry_policy_generic, wait=wait_exponential(exp_base=WAIT_EXP_BASE), stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS), **retry_args,