Use swh.core.github.pytest_plugin in github tests
Related to T4232
This commit is contained in:
parent
aa8c8cb3bc
commit
2ffe9c2aea
5 changed files with 11 additions and 348 deletions
|
@ -1,10 +1,10 @@
|
|||
# Copyright (C) 2020-2021 The Software Heritage developers
|
||||
# Copyright (C) 2020-2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import os
|
||||
|
||||
pytest_plugins = ["swh.scheduler.pytest_plugin"]
|
||||
pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"]
|
||||
|
||||
os.environ["LC_ALL"] = "C.UTF-8"
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
swh.core[db] >= 0.9
|
||||
swh.core[db,github] >= 2.6
|
||||
swh.scheduler >= 0.8
|
||||
|
|
|
@ -11,12 +11,12 @@ from urllib.parse import parse_qs, urlparse
|
|||
|
||||
import iso8601
|
||||
|
||||
from swh.core.github.utils import GitHubSession, MissingRateLimitReset
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import CredentialsType, Lister
|
||||
from .utils import GitHubSession, MissingRateLimitReset
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
# Copyright (C) 2020 The Software Heritage developers
|
||||
# Copyright (C) 2020-2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from typing import Any, Dict, Iterator, List
|
||||
|
||||
import pytest
|
||||
import requests_mock
|
||||
|
||||
from swh.core.github.pytest_plugin import github_response_callback
|
||||
from swh.lister.github.lister import GitHubLister
|
||||
from swh.lister.pattern import CredentialsType, ListerStats
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -20,51 +20,6 @@ NUM_PAGES = 10
|
|||
ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES
|
||||
|
||||
|
||||
def github_repo(i: int) -> Dict[str, Union[int, str]]:
|
||||
"""Basic repository information returned by the GitHub API"""
|
||||
|
||||
repo: Dict[str, Union[int, str]] = {
|
||||
"id": i,
|
||||
"html_url": f"https://github.com/origin/{i}",
|
||||
}
|
||||
|
||||
# Set the pushed_at date on one of the origins
|
||||
if i == 4321:
|
||||
repo["pushed_at"] = "2018-11-08T13:16:24Z"
|
||||
|
||||
return repo
|
||||
|
||||
|
||||
def github_response_callback(
|
||||
request: requests_mock.request._RequestObjectProxy,
|
||||
context: requests_mock.response._Context,
|
||||
) -> List[Dict[str, Union[str, int]]]:
|
||||
"""Return minimal GitHub API responses for the common case where the loader
|
||||
hasn't been rate-limited"""
|
||||
# Check request headers
|
||||
assert request.headers["Accept"] == "application/vnd.github.v3+json"
|
||||
assert "Software Heritage Lister" in request.headers["User-Agent"]
|
||||
|
||||
# Check request parameters: per_page == 1000, since = last_repo_id
|
||||
assert "per_page" in request.qs
|
||||
assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)]
|
||||
assert "since" in request.qs
|
||||
|
||||
since = int(request.qs["since"][0])
|
||||
|
||||
next_page = since + GitHubLister.PAGE_SIZE
|
||||
if next_page < ORIGIN_COUNT:
|
||||
# the first id for the next page is within our origin count; add a Link
|
||||
# header to the response
|
||||
next_url = (
|
||||
GitHubLister.API_URL
|
||||
+ f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}"
|
||||
)
|
||||
context.headers["Link"] = f"<{next_url}>; rel=next"
|
||||
|
||||
return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def requests_mocker() -> Iterator[requests_mock.Mocker]:
|
||||
with requests_mock.Mocker() as mock:
|
||||
|
@ -182,93 +137,8 @@ def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
|
|||
assert lister_data.current_state == {"last_seen_id": 123}
|
||||
|
||||
|
||||
def github_ratelimit_callback(
|
||||
request: requests_mock.request._RequestObjectProxy,
|
||||
context: requests_mock.response._Context,
|
||||
ratelimit_reset: Optional[int],
|
||||
) -> Dict[str, str]:
|
||||
"""Return a rate-limited GitHub API response."""
|
||||
# Check request headers
|
||||
assert request.headers["Accept"] == "application/vnd.github.v3+json"
|
||||
assert "Software Heritage Lister" in request.headers["User-Agent"]
|
||||
if "Authorization" in request.headers:
|
||||
context.status_code = 429
|
||||
else:
|
||||
context.status_code = 403
|
||||
|
||||
if ratelimit_reset is not None:
|
||||
context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)
|
||||
|
||||
return {
|
||||
"message": "API rate limit exceeded for <IP>.",
|
||||
"documentation_url": "https://developer.github.com/v3/#rate-limiting",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def num_before_ratelimit() -> int:
|
||||
"""Number of successful requests before the ratelimit hits"""
|
||||
return 0
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def num_ratelimit() -> Optional[int]:
|
||||
"""Number of rate-limited requests; None means infinity"""
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def ratelimit_reset() -> Optional[int]:
|
||||
"""Value of the X-Ratelimit-Reset header on ratelimited responses"""
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def requests_ratelimited(
|
||||
num_before_ratelimit: int,
|
||||
num_ratelimit: Optional[int],
|
||||
ratelimit_reset: Optional[int],
|
||||
) -> Iterator[requests_mock.Mocker]:
|
||||
"""Mock requests to the GitHub API, returning a rate-limiting status code
|
||||
after `num_before_ratelimit` requests.
|
||||
|
||||
GitHub does inconsistent rate-limiting:
|
||||
- Anonymous requests return a 403 status code
|
||||
- Authenticated requests return a 429 status code, with an
|
||||
X-Ratelimit-Reset header.
|
||||
|
||||
This fixture takes multiple arguments (which can be overridden with a
|
||||
:func:`pytest.mark.parametrize` parameter):
|
||||
- num_before_ratelimit: the global number of requests until the
|
||||
ratelimit triggers
|
||||
- num_ratelimit: the number of requests that return a
|
||||
rate-limited response.
|
||||
- ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the
|
||||
request is authenticated.
|
||||
|
||||
The default values set in the previous fixtures make all requests return a rate
|
||||
limit response.
|
||||
"""
|
||||
current_request = 0
|
||||
|
||||
def response_callback(request, context):
|
||||
nonlocal current_request
|
||||
current_request += 1
|
||||
if num_before_ratelimit < current_request and (
|
||||
num_ratelimit is None
|
||||
or current_request < num_before_ratelimit + num_ratelimit + 1
|
||||
):
|
||||
return github_ratelimit_callback(request, context, ratelimit_reset)
|
||||
else:
|
||||
return github_response_callback(request, context)
|
||||
|
||||
with requests_mock.Mocker() as mock:
|
||||
mock.get(GitHubLister.API_URL, json=response_callback)
|
||||
yield mock
|
||||
|
||||
|
||||
def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
|
||||
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
|
||||
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
|
||||
|
||||
lister = GitHubLister(scheduler=swh_scheduler)
|
||||
assert lister.github_session.anonymous
|
||||
|
@ -283,26 +153,6 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non
|
|||
assert "No X-Ratelimit-Reset value found in responses" in last_log.message
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def github_credentials() -> List[Dict[str, str]]:
|
||||
"""Return a static list of GitHub credentials"""
|
||||
return sorted(
|
||||
[{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)]
|
||||
+ [
|
||||
{"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"}
|
||||
for i in range(3)
|
||||
],
|
||||
key=lambda c: c["username"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def all_tokens(github_credentials) -> List[str]:
|
||||
"""Return the list of tokens matching the static credential"""
|
||||
|
||||
return [t.get("token", t.get("password")) for t in github_credentials]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType:
|
||||
"""Return the credentials formatted for use by the lister"""
|
||||
|
@ -323,29 +173,6 @@ def test_authenticated_credentials(
|
|||
]
|
||||
|
||||
|
||||
def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None):
|
||||
"""Record calls to time.sleep in the sleep_calls list"""
|
||||
if duration < 0:
|
||||
raise ValueError("Can't sleep for a negative amount of time!")
|
||||
if sleep_calls is not None:
|
||||
sleep_calls.append(duration)
|
||||
|
||||
|
||||
def fake_time_time():
|
||||
"""Return 0 when running time.time()"""
|
||||
return 0
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]:
|
||||
"""Monkeypatch `time.time` and `time.sleep`. Returns a list cumulating the arguments
|
||||
passed to time.sleep()."""
|
||||
sleeps: List[float] = []
|
||||
monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps))
|
||||
monkeypatch.setattr(time, "time", fake_time_time)
|
||||
yield sleeps
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_ratelimit", [1]
|
||||
) # return a single rate-limit response, then continue
|
||||
|
@ -358,7 +185,7 @@ def test_ratelimit_once_recovery(
|
|||
lister_credentials,
|
||||
):
|
||||
"""Check that the lister recovers from hitting the rate-limit once"""
|
||||
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
|
||||
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
|
||||
|
||||
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
|
||||
|
||||
|
@ -396,7 +223,7 @@ def test_ratelimit_reset_sleep(
|
|||
):
|
||||
"""Check that the lister properly handles rate-limiting when providing it with
|
||||
authentication tokens"""
|
||||
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
|
||||
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
|
||||
|
||||
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
|
||||
|
||||
|
|
|
@ -3,168 +3,4 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_any,
|
||||
retry_if_exception_type,
|
||||
retry_if_result,
|
||||
wait_exponential,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RateLimited(Exception):
|
||||
def __init__(self, response):
|
||||
self.reset_time: Optional[int]
|
||||
|
||||
# Figure out how long we need to sleep because of that rate limit
|
||||
ratelimit_reset = response.headers.get("X-Ratelimit-Reset")
|
||||
retry_after = response.headers.get("Retry-After")
|
||||
if ratelimit_reset is not None:
|
||||
self.reset_time = int(ratelimit_reset)
|
||||
elif retry_after is not None:
|
||||
self.reset_time = int(time.time()) + int(retry_after) + 1
|
||||
else:
|
||||
logger.warning(
|
||||
"Received a rate-limit-like status code %s, but no rate-limit "
|
||||
"headers set. Response content: %s",
|
||||
response.status_code,
|
||||
response.content,
|
||||
)
|
||||
self.reset_time = None
|
||||
self.response = response
|
||||
|
||||
|
||||
class MissingRateLimitReset(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class GitHubSession:
|
||||
"""Manages a :class:`requests.Session` with (optionally) multiple credentials,
|
||||
and cycles through them when reaching rate-limits."""
|
||||
|
||||
def __init__(
|
||||
self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None
|
||||
) -> None:
|
||||
"""Initialize a requests session with the proper headers for requests to
|
||||
GitHub."""
|
||||
self.credentials = credentials
|
||||
if self.credentials:
|
||||
random.shuffle(self.credentials)
|
||||
|
||||
self.session = requests.Session()
|
||||
|
||||
self.session.headers.update(
|
||||
{"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent}
|
||||
)
|
||||
|
||||
self.anonymous = not self.credentials
|
||||
|
||||
if self.anonymous:
|
||||
logger.warning("No tokens set in configuration, using anonymous mode")
|
||||
|
||||
self.token_index = -1
|
||||
self.current_user: Optional[str] = None
|
||||
|
||||
if not self.anonymous:
|
||||
# Initialize the first token value in the session headers
|
||||
self.set_next_session_token()
|
||||
|
||||
def set_next_session_token(self) -> None:
|
||||
"""Update the current authentication token with the next one in line."""
|
||||
|
||||
assert self.credentials
|
||||
|
||||
self.token_index = (self.token_index + 1) % len(self.credentials)
|
||||
|
||||
auth = self.credentials[self.token_index]
|
||||
|
||||
self.current_user = auth["username"]
|
||||
logger.debug("Using authentication token for user %s", self.current_user)
|
||||
|
||||
if "password" in auth:
|
||||
token = auth["password"]
|
||||
else:
|
||||
token = auth["token"]
|
||||
|
||||
self.session.headers.update({"Authorization": f"token {token}"})
|
||||
|
||||
@retry(
|
||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||
retry=retry_any(
|
||||
# ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
|
||||
# when running the lister on a connection with high latency
|
||||
retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
|
||||
# 502 status codes happen for a Server Error, sometimes
|
||||
retry_if_result(lambda r: r.status_code == 502),
|
||||
),
|
||||
)
|
||||
def _request(self, url: str) -> requests.Response:
|
||||
response = self.session.get(url)
|
||||
|
||||
if (
|
||||
# GitHub returns inconsistent status codes between unauthenticated
|
||||
# rate limit and authenticated rate limits. Handle both.
|
||||
response.status_code == 429
|
||||
or (self.anonymous and response.status_code == 403)
|
||||
):
|
||||
raise RateLimited(response)
|
||||
|
||||
return response
|
||||
|
||||
def request(self, url) -> requests.Response:
|
||||
"""Repeatedly requests the given URL, cycling through credentials and sleeping
|
||||
if necessary; until either a successful response or :exc:`MissingRateLimitReset`
|
||||
"""
|
||||
# The following for/else loop handles rate limiting; if successful,
|
||||
# it provides the rest of the function with a `response` object.
|
||||
#
|
||||
# If all tokens are rate-limited, we sleep until the reset time,
|
||||
# then `continue` into another iteration of the outer while loop,
|
||||
# attempting to get data from the same URL again.
|
||||
|
||||
while True:
|
||||
max_attempts = len(self.credentials) if self.credentials else 1
|
||||
reset_times: Dict[int, int] = {} # token index -> time
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
return self._request(url)
|
||||
except RateLimited as e:
|
||||
reset_info = "(unknown reset)"
|
||||
if e.reset_time is not None:
|
||||
reset_times[self.token_index] = e.reset_time
|
||||
reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
|
||||
|
||||
if not self.anonymous:
|
||||
logger.info(
|
||||
"Rate limit exhausted for current user %s %s",
|
||||
self.current_user,
|
||||
reset_info,
|
||||
)
|
||||
# Use next token in line
|
||||
self.set_next_session_token()
|
||||
# Wait one second to avoid triggering GitHub's abuse rate limits
|
||||
time.sleep(1)
|
||||
|
||||
# All tokens have been rate-limited. What do we do?
|
||||
|
||||
if not reset_times:
|
||||
logger.warning(
|
||||
"No X-Ratelimit-Reset value found in responses for any token; "
|
||||
"Giving up."
|
||||
)
|
||||
raise MissingRateLimitReset()
|
||||
|
||||
sleep_time = max(reset_times.values()) - time.time() + 1
|
||||
logger.info(
|
||||
"Rate limits exhausted for all tokens. Sleeping for %f seconds.",
|
||||
sleep_time,
|
||||
)
|
||||
time.sleep(sleep_time)
|
||||
from swh.core.github.utils import * # noqa
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue