lister: Add utility decorator to ease HTTP requests rate limit handling

Add the swh.lister.utils.throttling_retry decorator, which retries a function performing an HTTP request that may return a 429 (too many requests) status code. The implementation is based on the tenacity module, and it is assumed that the requests library is used when querying a URL. The default wait strategy is exponential backoff. The default maximum number of attempts is 5; once it is reached, the HTTPError exception is reraised. All tenacity.retry parameters can also be overridden in client code.
2021-01-15 12:25:45 +01:00 · 2021-01-15 12:25:45 +01:00 · d1fbccd988
commit d1fbccd988
parent c782275296
3 changed files with 174 additions and 5 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -7,4 +7,5 @@ xmltodict
 iso8601
 beautifulsoup4
 pytz
-launchpadlib
+launchpadlib
+tenacity
--- a/swh/lister/tests/test_utils.py
+++ b/swh/lister/tests/test_utils.py
@ -3,9 +3,17 @@
 # See top-level LICENSE file for more information

 import pytest
+import requests
+from requests.status_codes import codes
+from tenacity.wait import wait_fixed
 from testing.postgresql import Postgresql

-from swh.lister import utils
+from swh.lister.utils import (
+    MAX_NUMBER_ATTEMPTS,
+    WAIT_EXP_BASE,
+    split_range,
+    throttling_retry,
+)


@pytest.mark.parametrize(
@ -18,7 +26,7 @@ from swh.lister import utils
    ],
 )
 def test_split_range(total_pages, nb_pages, expected_ranges):
-    actual_ranges = list(utils.split_range(total_pages, nb_pages))
+    actual_ranges = list(split_range(total_pages, nb_pages))
    assert actual_ranges == expected_ranges


@ -26,7 +34,7 @@ def test_split_range(total_pages, nb_pages, expected_ranges):
 def test_split_range_errors(total_pages, nb_pages):
    for total_pages, nb_pages in [(None, 1), (100, None)]:
        with pytest.raises(TypeError):
-            next(utils.split_range(total_pages, nb_pages))
+            next(split_range(total_pages, nb_pages))


 def init_db():
@ -39,3 +47,87 @@ def init_db():
    initdb_args = Postgresql.DEFAULT_SETTINGS["initdb_args"]
    initdb_args = " ".join([initdb_args, "-E UTF-8"])
    return Postgresql(initdb_args=initdb_args)
+
+
+TEST_URL = "https://example.og/api/repositories"
+
+
+@throttling_retry()
+def make_request():
+    response = requests.get(TEST_URL)
+    response.raise_for_status()
+    return response
+
+
+def _assert_sleep_calls(mocker, mock_sleep, sleep_params):
+    try:
+        mock_sleep.assert_has_calls([mocker.call(param) for param in sleep_params])
+    except AssertionError:
+        # tenacity < 5.1 has a different behavior for wait_exponential
+        # https://github.com/jd/tenacity/commit/aac4307a0aa30d7befd0ebe4212ee4fc69083a95
+        mock_sleep.assert_has_calls(
+            [mocker.call(param * WAIT_EXP_BASE) for param in sleep_params]
+        )
+
+
+def test_throttling_retry(requests_mock, mocker):
+    data = {"result": {}}
+    requests_mock.get(
+        TEST_URL,
+        [
+            {"status_code": codes.too_many_requests},
+            {"status_code": codes.too_many_requests},
+            {"status_code": codes.ok, "json": data},
+        ],
+    )
+
+    mock_sleep = mocker.patch.object(make_request.retry, "sleep")
+
+    response = make_request()
+
+    _assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
+
+    assert response.json() == data
+
+
+def test_throttling_retry_max_attemps(requests_mock, mocker):
+    requests_mock.get(
+        TEST_URL, [{"status_code": codes.too_many_requests}] * (MAX_NUMBER_ATTEMPTS),
+    )
+
+    mock_sleep = mocker.patch.object(make_request.retry, "sleep")
+
+    with pytest.raises(requests.exceptions.HTTPError) as e:
+        make_request()
+
+    assert e.value.response.status_code == codes.too_many_requests
+
+    _assert_sleep_calls(
+        mocker,
+        mock_sleep,
+        [float(WAIT_EXP_BASE ** i) for i in range(MAX_NUMBER_ATTEMPTS - 1)],
+    )
+
+
+@throttling_retry(wait=wait_fixed(WAIT_EXP_BASE))
+def make_request_wait_fixed():
+    response = requests.get(TEST_URL)
+    response.raise_for_status()
+    return response
+
+
+def test_throttling_retry_wait_fixed(requests_mock, mocker):
+    requests_mock.get(
+        TEST_URL,
+        [
+            {"status_code": codes.too_many_requests},
+            {"status_code": codes.too_many_requests},
+            {"status_code": codes.ok},
+        ],
+    )
+
+    mock_sleep = mocker.patch.object(make_request_wait_fixed.retry, "sleep")
+
+    make_request_wait_fixed()
+
+    _assert_sleep_calls(mocker, mock_sleep, [WAIT_EXP_BASE] * 2)
--- a/swh/lister/utils.py
+++ b/swh/lister/utils.py
@ -1,9 +1,15 @@
-# Copyright (C) 2018-2020 the Software Heritage developers
+# Copyright (C) 2018-2021 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from typing import Iterator, Tuple

+from requests.exceptions import HTTPError
+from requests.status_codes import codes
+from tenacity import retry as tenacity_retry
+from tenacity.stop import stop_after_attempt
+from tenacity.wait import wait_exponential
+

 def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]:
    """Split `total_pages` into mostly `nb_pages` ranges. In some cases, the last range can
@ -27,3 +33,73 @@ def split_range(total_pages: int, nb_pages: int) -> Iterator[Tuple[int, int]]:

    if index != total_pages:
        yield index, total_pages
+
+
+def is_throttling_exception(e: Exception) -> bool:
+    """
+    Checks if an exception is a requests.exceptions.HTTPError for
+    a response with status code 429 (too many requests).
+    """
+    return (
+        isinstance(e, HTTPError) and e.response.status_code == codes.too_many_requests
+    )
+
+
+def retry_attempt(retry_state):
+    """
+    Utility function to get last retry attempt info based on the
+    tenacity version (as debian buster packages version 4.12).
+    """
+    try:
+        attempt = retry_state.outcome
+    except AttributeError:
+        # tenacity < 5.0
+        attempt = retry_state
+    return attempt
+
+
+def retry_if_throttling(retry_state) -> bool:
+    """
+    Custom tenacity retry predicate for handling HTTP responses with
+    status code 429 (too many requests).
+    """
+    attempt = retry_attempt(retry_state)
+    if attempt.failed:
+        exception = attempt.exception()
+        return is_throttling_exception(exception)
+    return False
+
+
+WAIT_EXP_BASE = 10
+MAX_NUMBER_ATTEMPTS = 5
+
+
+def throttling_retry(
+    retry=retry_if_throttling,
+    wait=wait_exponential(exp_base=WAIT_EXP_BASE),
+    stop=stop_after_attempt(max_attempt_number=MAX_NUMBER_ATTEMPTS),
+    **retry_args,
+):
+    """
+    Decorator based on `tenacity` for retrying a function possibly raising
+    requests.exceptions.HTTPError for status code 429 (too many requests).
+
+    It provides a default configuration that should work properly in most
+    cases but all `tenacity.retry` parameters can also be overridden in client
+    code.
+
+    When the maximum number of attempts is reached, the HTTPError exception is
+    reraised.
+
+    Args:
+        retry: function defining request retry condition (default to 429 status code)
+            https://tenacity.readthedocs.io/en/latest/#whether-to-retry
+
+        wait: function defining wait strategy before retrying (default to exponential
+            backoff) https://tenacity.readthedocs.io/en/latest/#waiting-before-retrying
+
+        stop: function defining when to stop retrying (default after 5 attempts)
+            https://tenacity.readthedocs.io/en/latest/#stopping
+
+    """
+    return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args)