Use generic HTTP retry policy by default and rename dedicated decorator
Instead of retrying HTTP requests only for 429 status code by default, prefer to use the generic retry policy enabling to also retry for status codes >= 500 but also on ConnectionError exceptions. Rename throttling_retry decorator to http_retry to reflect this change.
This commit is contained in:
parent
9b3e565cf7
commit
9c55acd286
23 changed files with 71 additions and 62 deletions
|
@ -11,7 +11,7 @@ from urllib.parse import urljoin
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -77,11 +77,11 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
|
|||
def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
|
||||
return asdict(state)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url, params) -> requests.Response:
|
||||
# Do the network resource request under a retrying decorator
|
||||
# to handle rate limiting and transient errors up to a limit.
|
||||
# `throttling_retry` by default use the `requests` library to check
|
||||
# `http_retry` by default use the `requests` library to check
|
||||
# only for rate-limit and a base-10 exponential waiting strategy.
|
||||
# This can be customized by passed waiting, retrying and logging strategies
|
||||
# as functions. See the `tenacity` library documentation.
|
||||
|
|
|
@ -317,7 +317,7 @@ We generally recommend logging every unhandleable error with the response conten
|
|||
then immediately stop the listing by doing an equivalent of
|
||||
:py:meth:`Response.raise_for_status` from the ``requests`` library. As for rate-limiting
|
||||
errors, we have a strategy of using a flexible decorator to handle the retrying for us.
|
||||
It is based on the ``tenacity`` library and accessible as :py:func:`throttling_retry` from
|
||||
It is based on the ``tenacity`` library and accessible as :py:func:`http_retry` from
|
||||
:py:mod:`swh.lister.utils`.
|
||||
|
||||
Pagination
|
||||
|
|
|
@ -16,7 +16,7 @@ from bs4 import BeautifulSoup
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.model.hashutil import hash_to_hex
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
@ -132,7 +132,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
|
|||
}
|
||||
)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.debug("Fetching URL %s with params %s", url, params)
|
||||
|
|
|
@ -14,7 +14,7 @@ import iso8601
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -107,7 +107,7 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
|
|||
if username is not None and password is not None:
|
||||
self.session.auth = (username, password)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
|
||||
def page_request(self, last_repo_cdate: str) -> requests.Response:
|
||||
|
||||
self.url_params["after"] = last_repo_cdate
|
||||
|
|
|
@ -8,7 +8,7 @@ from typing import Any, Dict, Iterator, List, Optional
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -49,7 +49,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
|
|||
}
|
||||
)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.info("Fetching URL %s with params %s", url, params)
|
||||
|
|
|
@ -15,7 +15,7 @@ from tenacity.before_sleep import before_sleep_log
|
|||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.pattern import CredentialsType, StatelessLister
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -79,7 +79,7 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
)
|
||||
self.base_git_url = base_git_url
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
|
||||
def _get_and_parse(self, url: str) -> BeautifulSoup:
|
||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
|
||||
response = self.session.get(url)
|
||||
|
|
|
@ -138,7 +138,7 @@ def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1):
|
|||
assert stats.pages == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("http_code", [400, 500, 502])
|
||||
@pytest.mark.parametrize("http_code", [400, 500])
|
||||
def test_gitea_list_http_error(
|
||||
swh_scheduler, requests_mock, http_code, trygitea_p1, trygitea_p2
|
||||
):
|
||||
|
|
|
@ -17,7 +17,7 @@ from tenacity.before_sleep import before_sleep_log
|
|||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.pattern import CredentialsType, Lister
|
||||
from swh.lister.utils import is_retryable_exception, throttling_retry
|
||||
from swh.lister.utils import http_retry, is_retryable_exception
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -138,7 +138,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]:
|
||||
return asdict(state)
|
||||
|
||||
@throttling_retry(
|
||||
@http_retry(
|
||||
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
|
||||
)
|
||||
def get_page_result(self, url: str) -> PageResult:
|
||||
|
|
|
@ -12,7 +12,7 @@ import iso8601
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -116,7 +116,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
|
|||
def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
|
||||
return asdict(state)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(
|
||||
self, url: str, params: Dict[str, Any]
|
||||
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||
|
|
|
@ -177,7 +177,7 @@ def test_gogs_auth_instance(
|
|||
assert stats.origins == 6
|
||||
|
||||
|
||||
@pytest.mark.parametrize("http_code", [400, 500, 502])
|
||||
@pytest.mark.parametrize("http_code", [400, 500])
|
||||
def test_gogs_list_http_error(
|
||||
swh_scheduler, requests_mock, http_code, trygogs_p1, trygogs_p3_last
|
||||
):
|
||||
|
|
|
@ -13,7 +13,7 @@ import iso8601
|
|||
import requests
|
||||
from tenacity import before_sleep_log
|
||||
|
||||
from swh.lister.utils import retry_policy_generic, throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -87,8 +87,7 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
|
|||
):
|
||||
self.updated = True
|
||||
|
||||
@throttling_retry(
|
||||
retry=retry_policy_generic,
|
||||
@http_retry(
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)
|
||||
def api_request(self, url: str) -> List[str]:
|
||||
|
|
|
@ -14,7 +14,7 @@ from lazr.restfulclient.errors import RestfulError
|
|||
from lazr.restfulclient.resource import Collection
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import retry_if_exception, throttling_retry
|
||||
from swh.lister.utils import http_retry, retry_if_exception
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -100,7 +100,7 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
|
|||
d[attribute_name] = date_last_modified.isoformat()
|
||||
return d
|
||||
|
||||
@throttling_retry(
|
||||
@http_retry(
|
||||
retry=retry_if_restful_error,
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)
|
||||
|
|
|
@ -16,7 +16,7 @@ import requests
|
|||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -112,7 +112,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
|
||||
return asdict(state)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.info("Fetching URL %s with params %s", url, params)
|
||||
|
|
|
@ -125,6 +125,11 @@ def network_requests_mock(
|
|||
requests_mock.get(URL_POM_3, content=maven_pom_3)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def retry_sleep_mock(mocker):
|
||||
mocker.patch.object(MavenLister.page_request.retry, "sleep")
|
||||
|
||||
|
||||
def test_maven_full_listing(swh_scheduler):
|
||||
"""Covers full listing of multiple pages, checking page results and listed
|
||||
origins, statelessness."""
|
||||
|
|
|
@ -12,7 +12,7 @@ from tenacity.before_sleep import before_sleep_log
|
|||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.pattern import CredentialsType, Lister
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -95,7 +95,7 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
|
|||
params["startkey"] = last_package_id
|
||||
return params
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, last_package_id: str) -> requests.Response:
|
||||
params = self.request_params(last_package_id)
|
||||
logger.debug("Fetching URL %s with params %s", self.url, params)
|
||||
|
|
|
@ -35,6 +35,11 @@ def npm_incremental_listing_page2(datadir):
|
|||
return json.loads(Path(datadir, "npm_incremental_page2.json").read_text())
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def retry_sleep_mock(mocker):
|
||||
mocker.patch.object(NpmLister.page_request.retry, "sleep")
|
||||
|
||||
|
||||
def _check_listed_npm_packages(lister, packages, scheduler_origins):
|
||||
for package in packages:
|
||||
package_name = package["doc"]["name"]
|
||||
|
|
|
@ -10,7 +10,7 @@ import requests
|
|||
from requests.exceptions import HTTPError
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -60,7 +60,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
|
|||
}
|
||||
)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.debug("Fetching URL %s with params %s", url, params)
|
||||
|
|
|
@ -13,7 +13,7 @@ from xmlrpc.client import Fault, ServerProxy
|
|||
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -88,7 +88,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
|
|||
def state_to_dict(self, state: PyPIListerState) -> Dict[str, Any]:
|
||||
return asdict(state)
|
||||
|
||||
@throttling_retry(
|
||||
@http_retry(
|
||||
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
|
||||
)
|
||||
def _changelog_last_serial(self, client: ServerProxy) -> int:
|
||||
|
@ -97,7 +97,7 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
|
|||
assert isinstance(serial, int)
|
||||
return serial
|
||||
|
||||
@throttling_retry(
|
||||
@http_retry(
|
||||
retry=_if_rate_limited, before_sleep=before_sleep_log(logger, logging.WARNING)
|
||||
)
|
||||
def _changelog_since_serial(
|
||||
|
|
|
@ -18,7 +18,7 @@ import requests
|
|||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.core.api.classes import stream_results
|
||||
from swh.lister.utils import retry_policy_generic, throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -208,8 +208,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
|
|||
self._project_last_modified = listed_origins
|
||||
return listed_origins
|
||||
|
||||
@throttling_retry(
|
||||
retry=retry_policy_generic,
|
||||
@http_retry(
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)
|
||||
def page_request(self, url, params) -> requests.Response:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2018-2021 the Software Heritage developers
|
||||
# Copyright (C) 2018-2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -7,12 +7,7 @@ import requests
|
|||
from requests.status_codes import codes
|
||||
from tenacity.wait import wait_fixed
|
||||
|
||||
from swh.lister.utils import (
|
||||
MAX_NUMBER_ATTEMPTS,
|
||||
WAIT_EXP_BASE,
|
||||
split_range,
|
||||
throttling_retry,
|
||||
)
|
||||
from swh.lister.utils import MAX_NUMBER_ATTEMPTS, WAIT_EXP_BASE, http_retry, split_range
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -51,7 +46,7 @@ def test_split_range_errors(total_pages, nb_pages):
|
|||
TEST_URL = "https://example.og/api/repositories"
|
||||
|
||||
|
||||
@throttling_retry()
|
||||
@http_retry()
|
||||
def make_request():
|
||||
response = requests.get(TEST_URL)
|
||||
response.raise_for_status()
|
||||
|
@ -62,13 +57,22 @@ def assert_sleep_calls(mocker, mock_sleep, sleep_params):
|
|||
mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params])
|
||||
|
||||
|
||||
def test_throttling_retry(requests_mock, mocker):
|
||||
@pytest.mark.parametrize(
|
||||
"status_code",
|
||||
[
|
||||
codes.too_many_requests,
|
||||
codes.internal_server_error,
|
||||
codes.bad_gateway,
|
||||
codes.service_unavailable,
|
||||
],
|
||||
)
|
||||
def test_http_retry(requests_mock, mocker, status_code):
|
||||
data = {"result": {}}
|
||||
requests_mock.get(
|
||||
TEST_URL,
|
||||
[
|
||||
{"status_code": codes.too_many_requests},
|
||||
{"status_code": codes.too_many_requests},
|
||||
{"status_code": status_code},
|
||||
{"status_code": status_code},
|
||||
{"status_code": codes.ok, "json": data},
|
||||
],
|
||||
)
|
||||
|
@ -82,7 +86,7 @@ def test_throttling_retry(requests_mock, mocker):
|
|||
assert response.json() == data
|
||||
|
||||
|
||||
def test_throttling_retry_max_attemps(requests_mock, mocker):
|
||||
def test_http_retry_max_attemps(requests_mock, mocker):
|
||||
requests_mock.get(
|
||||
TEST_URL,
|
||||
[{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS),
|
||||
|
@ -102,14 +106,14 @@ def test_throttling_retry_max_attemps(requests_mock, mocker):
|
|||
)
|
||||
|
||||
|
||||
@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE))
|
||||
@http_retry(wait=wait_fixed(WAIT_EXP_BASE))
|
||||
def make_request_wait_fixed():
|
||||
response = requests.get(TEST_URL)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
|
||||
|
||||
def test_throttling_retry_wait_fixed(requests_mock, mocker):
|
||||
def test_http_retry_wait_fixed(requests_mock, mocker):
|
||||
requests_mock.get(
|
||||
TEST_URL,
|
||||
[
|
||||
|
|
|
@ -11,7 +11,7 @@ import iso8601
|
|||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.lister.utils import http_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -65,7 +65,7 @@ class TuleapLister(StatelessLister[RepoPage]):
|
|||
}
|
||||
)
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.info("Fetching URL %s with params %s", url, params)
|
||||
|
|
|
@ -92,6 +92,11 @@ def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[st
|
|||
return text, headers, page_results, origin_urls
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def retry_sleep_mock(mocker):
|
||||
mocker.patch.object(TuleapLister.page_request.retry, "sleep")
|
||||
|
||||
|
||||
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):
|
||||
"""Asserts that the two collections have the same origin URLs.
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2018-2021 the Software Heritage developers
|
||||
# Copyright (C) 2018-2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
@ -66,14 +66,6 @@ def retry_if_exception(retry_state, predicate: Callable[[Exception], bool]) -> b
|
|||
return False
|
||||
|
||||
|
||||
def retry_if_throttling(retry_state) -> bool:
|
||||
"""
|
||||
Custom tenacity retry predicate for handling HTTP responses with
|
||||
status code 429 (too many requests).
|
||||
"""
|
||||
return retry_if_exception(retry_state, is_throttling_exception)
|
||||
|
||||
|
||||
def retry_policy_generic(retry_state) -> bool:
|
||||
"""
|
||||
Custom tenacity retry predicate for handling failed requests:
|
||||
|
@ -90,8 +82,8 @@ WAIT_EXP_BASE = 10
|
|||
MAX_NUMBER_ATTEMPTS = 5
|
||||
|
||||
|
||||
def throttling_retry(
|
||||
retry=retry_if_throttling,
|
||||
def http_retry(
|
||||
retry=retry_policy_generic,
|
||||
wait=wait_exponential(exp_base=WAIT_EXP_BASE),
|
||||
stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS),
|
||||
**retry_args,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue