Use swh.core.github.pytest_plugin in github tests

Related to T4232
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-05-20 15:59:13 +02:00
parent aa8c8cb3bc
commit 2ffe9c2aea
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
5 changed files with 11 additions and 348 deletions

View file

@ -1,10 +1,10 @@
# Copyright (C) 2020-2021 The Software Heritage developers
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
pytest_plugins = ["swh.scheduler.pytest_plugin"]
pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"]
os.environ["LC_ALL"] = "C.UTF-8"

View file

@ -1,2 +1,2 @@
swh.core[db] >= 0.9
swh.core[db,github] >= 2.6
swh.scheduler >= 0.8

View file

@ -11,12 +11,12 @@ from urllib.parse import parse_qs, urlparse
import iso8601
from swh.core.github.utils import GitHubSession, MissingRateLimitReset
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
from .utils import GitHubSession, MissingRateLimitReset
logger = logging.getLogger(__name__)

View file

@ -1,16 +1,16 @@
# Copyright (C) 2020 The Software Heritage developers
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import time
from typing import Any, Dict, Iterator, List, Optional, Union
from typing import Any, Dict, Iterator, List
import pytest
import requests_mock
from swh.core.github.pytest_plugin import github_response_callback
from swh.lister.github.lister import GitHubLister
from swh.lister.pattern import CredentialsType, ListerStats
from swh.scheduler.interface import SchedulerInterface
@ -20,51 +20,6 @@ NUM_PAGES = 10
ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES
def github_repo(i: int) -> Dict[str, Union[int, str]]:
"""Basic repository information returned by the GitHub API"""
repo: Dict[str, Union[int, str]] = {
"id": i,
"html_url": f"https://github.com/origin/{i}",
}
# Set the pushed_at date on one of the origins
if i == 4321:
repo["pushed_at"] = "2018-11-08T13:16:24Z"
return repo
def github_response_callback(
request: requests_mock.request._RequestObjectProxy,
context: requests_mock.response._Context,
) -> List[Dict[str, Union[str, int]]]:
"""Return minimal GitHub API responses for the common case where the loader
hasn't been rate-limited"""
# Check request headers
assert request.headers["Accept"] == "application/vnd.github.v3+json"
assert "Software Heritage Lister" in request.headers["User-Agent"]
# Check request parameters: per_page == 1000, since = last_repo_id
assert "per_page" in request.qs
assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)]
assert "since" in request.qs
since = int(request.qs["since"][0])
next_page = since + GitHubLister.PAGE_SIZE
if next_page < ORIGIN_COUNT:
# the first id for the next page is within our origin count; add a Link
# header to the response
next_url = (
GitHubLister.API_URL
+ f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}"
)
context.headers["Link"] = f"<{next_url}>; rel=next"
return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)]
@pytest.fixture()
def requests_mocker() -> Iterator[requests_mock.Mocker]:
with requests_mock.Mocker() as mock:
@ -182,93 +137,8 @@ def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
assert lister_data.current_state == {"last_seen_id": 123}
def github_ratelimit_callback(
request: requests_mock.request._RequestObjectProxy,
context: requests_mock.response._Context,
ratelimit_reset: Optional[int],
) -> Dict[str, str]:
"""Return a rate-limited GitHub API response."""
# Check request headers
assert request.headers["Accept"] == "application/vnd.github.v3+json"
assert "Software Heritage Lister" in request.headers["User-Agent"]
if "Authorization" in request.headers:
context.status_code = 429
else:
context.status_code = 403
if ratelimit_reset is not None:
context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)
return {
"message": "API rate limit exceeded for <IP>.",
"documentation_url": "https://developer.github.com/v3/#rate-limiting",
}
@pytest.fixture()
def num_before_ratelimit() -> int:
"""Number of successful requests before the ratelimit hits"""
return 0
@pytest.fixture()
def num_ratelimit() -> Optional[int]:
"""Number of rate-limited requests; None means infinity"""
return None
@pytest.fixture()
def ratelimit_reset() -> Optional[int]:
"""Value of the X-Ratelimit-Reset header on ratelimited responses"""
return None
@pytest.fixture()
def requests_ratelimited(
num_before_ratelimit: int,
num_ratelimit: Optional[int],
ratelimit_reset: Optional[int],
) -> Iterator[requests_mock.Mocker]:
"""Mock requests to the GitHub API, returning a rate-limiting status code
after `num_before_ratelimit` requests.
GitHub does inconsistent rate-limiting:
- Anonymous requests return a 403 status code
- Authenticated requests return a 429 status code, with an
X-Ratelimit-Reset header.
This fixture takes multiple arguments (which can be overridden with a
:func:`pytest.mark.parametrize` parameter):
- num_before_ratelimit: the global number of requests until the
ratelimit triggers
- num_ratelimit: the number of requests that return a
rate-limited response.
- ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the
request is authenticated.
The default values set in the previous fixtures make all requests return a rate
limit response.
"""
current_request = 0
def response_callback(request, context):
nonlocal current_request
current_request += 1
if num_before_ratelimit < current_request and (
num_ratelimit is None
or current_request < num_before_ratelimit + num_ratelimit + 1
):
return github_ratelimit_callback(request, context, ratelimit_reset)
else:
return github_response_callback(request, context)
with requests_mock.Mocker() as mock:
mock.get(GitHubLister.API_URL, json=response_callback)
yield mock
def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
lister = GitHubLister(scheduler=swh_scheduler)
assert lister.github_session.anonymous
@ -283,26 +153,6 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non
assert "No X-Ratelimit-Reset value found in responses" in last_log.message
@pytest.fixture
def github_credentials() -> List[Dict[str, str]]:
"""Return a static list of GitHub credentials"""
return sorted(
[{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)]
+ [
{"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"}
for i in range(3)
],
key=lambda c: c["username"],
)
@pytest.fixture
def all_tokens(github_credentials) -> List[str]:
"""Return the list of tokens matching the static credential"""
return [t.get("token", t.get("password")) for t in github_credentials]
@pytest.fixture
def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType:
"""Return the credentials formatted for use by the lister"""
@ -323,29 +173,6 @@ def test_authenticated_credentials(
]
def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None):
"""Record calls to time.sleep in the sleep_calls list"""
if duration < 0:
raise ValueError("Can't sleep for a negative amount of time!")
if sleep_calls is not None:
sleep_calls.append(duration)
def fake_time_time():
"""Return 0 when running time.time()"""
return 0
@pytest.fixture
def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]:
"""Monkeypatch `time.time` and `time.sleep`. Returns a list cumulating the arguments
passed to time.sleep()."""
sleeps: List[float] = []
monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps))
monkeypatch.setattr(time, "time", fake_time_time)
yield sleeps
@pytest.mark.parametrize(
"num_ratelimit", [1]
) # return a single rate-limit response, then continue
@ -358,7 +185,7 @@ def test_ratelimit_once_recovery(
lister_credentials,
):
"""Check that the lister recovers from hitting the rate-limit once"""
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
@ -396,7 +223,7 @@ def test_ratelimit_reset_sleep(
):
"""Check that the lister properly handles rate-limiting when providing it with
authentication tokens"""
caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)

View file

@ -3,168 +3,4 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import random
import time
from typing import Dict, List, Optional
import requests
from tenacity import (
retry,
retry_any,
retry_if_exception_type,
retry_if_result,
wait_exponential,
)
logger = logging.getLogger(__name__)
class RateLimited(Exception):
def __init__(self, response):
self.reset_time: Optional[int]
# Figure out how long we need to sleep because of that rate limit
ratelimit_reset = response.headers.get("X-Ratelimit-Reset")
retry_after = response.headers.get("Retry-After")
if ratelimit_reset is not None:
self.reset_time = int(ratelimit_reset)
elif retry_after is not None:
self.reset_time = int(time.time()) + int(retry_after) + 1
else:
logger.warning(
"Received a rate-limit-like status code %s, but no rate-limit "
"headers set. Response content: %s",
response.status_code,
response.content,
)
self.reset_time = None
self.response = response
class MissingRateLimitReset(Exception):
pass
class GitHubSession:
"""Manages a :class:`requests.Session` with (optionally) multiple credentials,
and cycles through them when reaching rate-limits."""
def __init__(
self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None
) -> None:
"""Initialize a requests session with the proper headers for requests to
GitHub."""
self.credentials = credentials
if self.credentials:
random.shuffle(self.credentials)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent}
)
self.anonymous = not self.credentials
if self.anonymous:
logger.warning("No tokens set in configuration, using anonymous mode")
self.token_index = -1
self.current_user: Optional[str] = None
if not self.anonymous:
# Initialize the first token value in the session headers
self.set_next_session_token()
def set_next_session_token(self) -> None:
"""Update the current authentication token with the next one in line."""
assert self.credentials
self.token_index = (self.token_index + 1) % len(self.credentials)
auth = self.credentials[self.token_index]
self.current_user = auth["username"]
logger.debug("Using authentication token for user %s", self.current_user)
if "password" in auth:
token = auth["password"]
else:
token = auth["token"]
self.session.headers.update({"Authorization": f"token {token}"})
@retry(
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_any(
# ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
# when running the lister on a connection with high latency
retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
# 502 status codes happen for a Server Error, sometimes
retry_if_result(lambda r: r.status_code == 502),
),
)
def _request(self, url: str) -> requests.Response:
response = self.session.get(url)
if (
# GitHub returns inconsistent status codes between unauthenticated
# rate limit and authenticated rate limits. Handle both.
response.status_code == 429
or (self.anonymous and response.status_code == 403)
):
raise RateLimited(response)
return response
def request(self, url) -> requests.Response:
"""Repeatedly requests the given URL, cycling through credentials and sleeping
if necessary; until either a successful response or :exc:`MissingRateLimitReset`
"""
# The following for/else loop handles rate limiting; if successful,
# it provides the rest of the function with a `response` object.
#
# If all tokens are rate-limited, we sleep until the reset time,
# then `continue` into another iteration of the outer while loop,
# attempting to get data from the same URL again.
while True:
max_attempts = len(self.credentials) if self.credentials else 1
reset_times: Dict[int, int] = {} # token index -> time
for attempt in range(max_attempts):
try:
return self._request(url)
except RateLimited as e:
reset_info = "(unknown reset)"
if e.reset_time is not None:
reset_times[self.token_index] = e.reset_time
reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
if not self.anonymous:
logger.info(
"Rate limit exhausted for current user %s %s",
self.current_user,
reset_info,
)
# Use next token in line
self.set_next_session_token()
# Wait one second to avoid triggering GitHub's abuse rate limits
time.sleep(1)
# All tokens have been rate-limited. What do we do?
if not reset_times:
logger.warning(
"No X-Ratelimit-Reset value found in responses for any token; "
"Giving up."
)
raise MissingRateLimitReset()
sleep_time = max(reset_times.values()) - time.time() + 1
logger.info(
"Rate limits exhausted for all tokens. Sleeping for %f seconds.",
sleep_time,
)
time.sleep(sleep_time)
from swh.core.github.utils import * # noqa