Use generic HTTP retry policy by default and rename dedicated decorator

Instead of retrying HTTP requests only for 429 status code by default,
prefer to use the generic retry policy enabling to also retry for status
codes >= 500 but also on ConnectionError exceptions.

Rename throttling_retry decorator to http_retry to reflect this change.
This commit is contained in:
Antoine Lambert 2022-09-21 16:56:34 +02:00
parent 9b3e565cf7
commit 9c55acd286
23 changed files with 71 additions and 62 deletions

View file

@ -11,7 +11,7 @@ from urllib.parse import urljoin
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -77,11 +77,11 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
# Do the network resource request under a retrying decorator
# to handle rate limiting and transient errors up to a limit.
# `throttling_retry` by default use the `requests` library to check
# `http_retry` by default use the `requests` library to check
# only for rate-limit and a base-10 exponential waiting strategy.
# This can be customized by passed waiting, retrying and logging strategies
# as functions. See the `tenacity` library documentation.

View file

@ -317,7 +317,7 @@ We generally recommend logging every unhandleable error with the response conten
then immediately stop the listing by doing an equivalent of
:py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting
errors, we have a strategy of using a flexible decorator to handle the retrying for us.
It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from
It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from
:py:mod:`swh.lister.utils`.
Pagination

View file

@ -16,7 +16,7 @@ from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -132,7 +132,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
}
)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)

View file

@ -14,7 +14,7 @@ import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -107,7 +107,7 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
if username is not None and password is not None:
self.session.auth = (username, password)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def page_request(self, last_repo_cdate: str) -> requests.Response:
self.url_params["after"] = last_repo_cdate

View file

@ -8,7 +8,7 @@ from typing import Any, Dict, Iterator, List, Optional
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -49,7 +49,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
}
)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)

View file

@ -15,7 +15,7 @@ from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -79,7 +79,7 @@ class CGitLister(StatelessLister[Repositories]):
)
self.base_git_url = base_git_url
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)

View file

@ -138,7 +138,7 @@ def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1):
assert stats.pages == 1
@pytest.mark.parametrize("http_code", [400, 500, 502])
@pytest.mark.parametrize("http_code", [400, 500])
def test_gitea_list_http_error(
swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2
):

View file

@ -17,7 +17,7 @@ from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
from swh.lister.utils import is_retryable_exception, throttling_retry
from swh.lister.utils import http_retry, is_retryable_exception
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
@ -138,7 +138,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(
@http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def get_page_result(self, url: str) -> PageResult:

View file

@ -12,7 +12,7 @@ import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -116,7 +116,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(
self, url: str, params: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:

View file

@ -177,7 +177,7 @@ def test_gogs_auth_instance(
assert stats.origins == 6
@pytest.mark.parametrize("http_code", [400, 500, 502])
@pytest.mark.parametrize("http_code", [400, 500])
def test_gogs_list_http_error(
swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last
):

View file

@ -13,7 +13,7 @@ import iso8601
import requests
from tenacity import before_sleep_log
from swh.lister.utils import retry_policy_generic, throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -87,8 +87,7 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
):
self.updated = True
@throttling_retry(
retry=retry_policy_generic,
@http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def api_request(self, url: str) -> List[str]:

View file

@ -14,7 +14,7 @@ from lazr.restfulclient.errors import RestfulError
from lazr.restfulclient.resource import Collection
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import retry_if_exception, throttling_retry
from swh.lister.utils import http_retry, retry_if_exception
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -100,7 +100,7 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
d[attribute_name] = date_last_modified.isoformat()
return d
@throttling_retry(
@http_retry(
retry=retry_if_restful_error,
before_sleep=before_sleep_log(logger, logging.WARNING),
)

View file

@ -16,7 +16,7 @@ import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.github.utils import GitHubSession
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -112,7 +112,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)

View file

@ -125,6 +125,11 @@ def network_requests_mock(
requests_mock.get(URL_POM_3, content=maven_pom_3)
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(MavenLister.page_request.retry, "sleep")
def test_maven_full_listing(swh_scheduler):
"""Covers full listing of multiple pages, checking page results and listed
origins, statelessness."""

View file

@ -12,7 +12,7 @@ from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -95,7 +95,7 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
params["startkey"] = last_package_id
return params
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, last_package_id: str) -> requests.Response:
params = self.request_params(last_package_id)
logger.debug("Fetching URL %s with params %s", self.url, params)

View file

@ -35,6 +35,11 @@ def npm_incremental_listing_page2(datadir):
return json.loads(Path(datadir, "npm_incremental_page2.json").read_text())
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(NpmLister.page_request.retry, "sleep")
def _check_listed_npm_packages(lister, packages, scheduler_origins):
for package in packages:
package_name = package["doc"]["name"]

View file

@ -10,7 +10,7 @@ import requests
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -60,7 +60,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
}
)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)

View file

@ -13,7 +13,7 @@ from xmlrpc.client import Fault, ServerProxy
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -88,7 +88,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
return asdict(state)
@throttling_retry(
@http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def _changelog_last_serial(self, client: ServerProxy) -> int:
@ -97,7 +97,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
assert isinstance(serial, int)
return serial
@throttling_retry(
@http_retry(
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
)
def _changelog_since_serial(

View file

@ -18,7 +18,7 @@ import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.api.classes import stream_results
from swh.lister.utils import retry_policy_generic, throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -208,8 +208,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
self._project_last_modified = listed_origins
return listed_origins
@throttling_retry(
retry=retry_policy_generic,
@http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def page_request(self, url, params) -> requests.Response:

View file

@ -1,4 +1,4 @@
# Copyright (C) 2018-2021 the Software Heritage developers
# Copyright (C) 2018-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,12 +7,7 @@ import requests
from requests.status_codes import codes
from tenacity.wait import wait_fixed
from swh.lister.utils import (
MAX_NUMBER_ATTEMPTS,
WAIT_EXP_BASE,
split_range,
throttling_retry,
)
from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range
@pytest.mark.parametrize(
@ -51,7 +46,7 @@ def test_split_range_errors(total_pages, nb_pages):
TEST_URL = "https://example.og/api/repositories"
@throttling_retry()
@http_retry()
def make_request():
response = requests.get(TEST_URL)
response.raise_for_status()
@ -62,13 +57,22 @@ def assert_sleep_calls(mocker, mock_sleep, sleep_params):
mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params])
def test_throttling_retry(requests_mock, mocker):
@pytest.mark.parametrize(
"status_code",
[
codes.too_many_requests,
codes.internal_server_error,
codes.bad_gateway,
codes.service_unavailable,
],
)
def test_http_retry(requests_mock, mocker, status_code):
data = {"result": {}}
requests_mock.get(
TEST_URL,
[
{"status_code": codes.too_many_requests},
{"status_code": codes.too_many_requests},
{"status_code": status_code},
{"status_code": status_code},
{"status_code": codes.ok, "json": data},
],
)
@ -82,7 +86,7 @@ def test_throttling_retry(requests_mock, mocker):
assert response.json() == data
def test_throttling_retry_max_attemps(requests_mock, mocker):
def test_http_retry_max_attemps(requests_mock, mocker):
requests_mock.get(
TEST_URL,
[{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS),
@ -102,14 +106,14 @@ def test_throttling_retry_max_attemps(requests_mock, mocker):
)
@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE))
@http_retry(wait=wait_fixed(WAIT_EXP_BASE))
def make_request_wait_fixed():
response = requests.get(TEST_URL)
response.raise_for_status()
return response
def test_throttling_retry_wait_fixed(requests_mock, mocker):
def test_http_retry_wait_fixed(requests_mock, mocker):
requests_mock.get(
TEST_URL,
[

View file

@ -11,7 +11,7 @@ import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -65,7 +65,7 @@ class TuleapLister(StatelessLister[RepoPage]):
}
)
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)

View file

@ -92,6 +92,11 @@ def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[st
return text, headers, page_results, origin_urls
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(TuleapLister.page_request.retry, "sleep")
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
"""Asserts that the two collections have the same origin URLs.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2018-2021 the Software Heritage developers
# Copyright (C) 2018-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -66,14 +66,6 @@ def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> b
return False
def retry_if_throttling(retry_state) -> bool:
"""
Custom tenacity retry predicate for handling HTTP responses with
status code 429 (too many requests).
"""
return retry_if_exception(retry_state, is_throttling_exception)
def retry_policy_generic(retry_state) -> bool:
"""
Custom tenacity retry predicate for handling failed requests:
@ -90,8 +82,8 @@ WAIT_EXP_BASE = 10
MAX_NUMBER_ATTEMPTS = 5
def throttling_retry(
retry=retry_if_throttling,
def http_retry(
retry=retry_policy_generic,
wait=wait_exponential(exp_base=WAIT_EXP_BASE),
stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS),
**retry_args,