Use swh.core.github.pytest_plugin in github tests

Related to T4232
2022-05-20 15:59:13 +02:00 · 2022-05-20 15:59:13 +02:00 · 2ffe9c2aea
commit 2ffe9c2aea
parent aa8c8cb3bc
5 changed files with 11 additions and 348 deletions
--- a/conftest.py
+++ b/conftest.py
@ -1,10 +1,10 @@
-# Copyright (C) 2020-2021  The Software Heritage developers
+# Copyright (C) 2020-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os

-pytest_plugins = ["swh.scheduler.pytest_plugin"]
+pytest_plugins = ["swh.scheduler.pytest_plugin", "swh.core.github.pytest_plugin"]

 os.environ["LC_ALL"] = "C.UTF-8"
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@ -1,2 +1,2 @@
-swh.core[db] >= 0.9
+swh.core[db,github] >= 2.6
 swh.scheduler >= 0.8
--- a/swh/lister/github/lister.py
+++ b/swh/lister/github/lister.py
@ -11,12 +11,12 @@ from urllib.parse import parse_qs, urlparse

 import iso8601

+from swh.core.github.utils import GitHubSession, MissingRateLimitReset
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin

 from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
-from .utils import GitHubSession, MissingRateLimitReset

 logger = logging.getLogger(__name__)

--- a/swh/lister/github/tests/test_lister.py
+++ b/swh/lister/github/tests/test_lister.py
@ -1,16 +1,16 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import logging
-import time
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List

 import pytest
 import requests_mock

+from swh.core.github.pytest_plugin import github_response_callback
 from swh.lister.github.lister import GitHubLister
 from swh.lister.pattern import CredentialsType, ListerStats
 from swh.scheduler.interface import SchedulerInterface
@ -20,51 +20,6 @@ NUM_PAGES = 10
 ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES


-def github_repo(i: int) -> Dict[str, Union[int, str]]:
-    """Basic repository information returned by the GitHub API"""
-
-    repo: Dict[str, Union[int, str]] = {
-        "id": i,
-        "html_url": f"https://github.com/origin/{i}",
-    }
-
-    # Set the pushed_at date on one of the origins
-    if i == 4321:
-        repo["pushed_at"] = "2018-11-08T13:16:24Z"
-
-    return repo
-
-
-def github_response_callback(
-    request: requests_mock.request._RequestObjectProxy,
-    context: requests_mock.response._Context,
-) -> List[Dict[str, Union[str, int]]]:
-    """Return minimal GitHub API responses for the common case where the loader
-    hasn't been rate-limited"""
-    # Check request headers
-    assert request.headers["Accept"] == "application/vnd.github.v3+json"
-    assert "Software Heritage Lister" in request.headers["User-Agent"]
-
-    # Check request parameters: per_page == 1000, since = last_repo_id
-    assert "per_page" in request.qs
-    assert request.qs["per_page"] == [str(GitHubLister.PAGE_SIZE)]
-    assert "since" in request.qs
-
-    since = int(request.qs["since"][0])
-
-    next_page = since + GitHubLister.PAGE_SIZE
-    if next_page < ORIGIN_COUNT:
-        # the first id for the next page is within our origin count; add a Link
-        # header to the response
-        next_url = (
-            GitHubLister.API_URL
-            + f"?per_page={GitHubLister.PAGE_SIZE}&since={next_page}"
-        )
-        context.headers["Link"] = f"<{next_url}>; rel=next"
-
-    return [github_repo(i) for i in range(since + 1, min(next_page, ORIGIN_COUNT) + 1)]
-
-
@pytest.fixture()
 def requests_mocker() -> Iterator[requests_mock.Mocker]:
    with requests_mock.Mocker() as mock:
@ -182,93 +137,8 @@ def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
    assert lister_data.current_state == {"last_seen_id": 123}


-def github_ratelimit_callback(
-    request: requests_mock.request._RequestObjectProxy,
-    context: requests_mock.response._Context,
-    ratelimit_reset: Optional[int],
-) -> Dict[str, str]:
-    """Return a rate-limited GitHub API response."""
-    # Check request headers
-    assert request.headers["Accept"] == "application/vnd.github.v3+json"
-    assert "Software Heritage Lister" in request.headers["User-Agent"]
-    if "Authorization" in request.headers:
-        context.status_code = 429
-    else:
-        context.status_code = 403
-
-    if ratelimit_reset is not None:
-        context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)
-
-    return {
-        "message": "API rate limit exceeded for <IP>.",
-        "documentation_url": "https://developer.github.com/v3/#rate-limiting",
-    }
-
-
-@pytest.fixture()
-def num_before_ratelimit() -> int:
-    """Number of successful requests before the ratelimit hits"""
-    return 0
-
-
-@pytest.fixture()
-def num_ratelimit() -> Optional[int]:
-    """Number of rate-limited requests; None means infinity"""
-    return None
-
-
-@pytest.fixture()
-def ratelimit_reset() -> Optional[int]:
-    """Value of the X-Ratelimit-Reset header on ratelimited responses"""
-    return None
-
-
-@pytest.fixture()
-def requests_ratelimited(
-    num_before_ratelimit: int,
-    num_ratelimit: Optional[int],
-    ratelimit_reset: Optional[int],
-) -> Iterator[requests_mock.Mocker]:
-    """Mock requests to the GitHub API, returning a rate-limiting status code
-    after `num_before_ratelimit` requests.
-
-    GitHub does inconsistent rate-limiting:
-      - Anonymous requests return a 403 status code
-      - Authenticated requests return a 429 status code, with an
-        X-Ratelimit-Reset header.
-
-    This fixture takes multiple arguments (which can be overridden with a
-    :func:`pytest.mark.parametrize` parameter):
-     - num_before_ratelimit: the global number of requests until the
-       ratelimit triggers
-     - num_ratelimit: the number of requests that return a
-       rate-limited response.
-     - ratelimit_reset: the timestamp returned in X-Ratelimit-Reset if the
-       request is authenticated.
-
-    The default values set in the previous fixtures make all requests return a rate
-    limit response.
-    """
-    current_request = 0
-
-    def response_callback(request, context):
-        nonlocal current_request
-        current_request += 1
-        if num_before_ratelimit < current_request and (
-            num_ratelimit is None
-            or current_request < num_before_ratelimit + num_ratelimit + 1
-        ):
-            return github_ratelimit_callback(request, context, ratelimit_reset)
-        else:
-            return github_response_callback(request, context)
-
-    with requests_mock.Mocker() as mock:
-        mock.get(GitHubLister.API_URL, json=response_callback)
-        yield mock
-
-
 def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
-    caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler)
    assert lister.github_session.anonymous
@ -283,26 +153,6 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non
    assert "No X-Ratelimit-Reset value found in responses" in last_log.message


-@pytest.fixture
-def github_credentials() -> List[Dict[str, str]]:
-    """Return a static list of GitHub credentials"""
-    return sorted(
-        [{"username": f"swh{i:d}", "token": f"token-{i:d}"} for i in range(3)]
-        + [
-            {"username": f"swh-legacy{i:d}", "password": f"token-legacy-{i:d}"}
-            for i in range(3)
-        ],
-        key=lambda c: c["username"],
-    )
-
-
-@pytest.fixture
-def all_tokens(github_credentials) -> List[str]:
-    """Return the list of tokens matching the static credential"""
-
-    return [t.get("token", t.get("password")) for t in github_credentials]
-
-
@pytest.fixture
 def lister_credentials(github_credentials: List[Dict[str, str]]) -> CredentialsType:
    """Return the credentials formatted for use by the lister"""
@ -323,29 +173,6 @@ def test_authenticated_credentials(
    ]


-def fake_time_sleep(duration: float, sleep_calls: Optional[List[float]] = None):
-    """Record calls to time.sleep in the sleep_calls list"""
-    if duration < 0:
-        raise ValueError("Can't sleep for a negative amount of time!")
-    if sleep_calls is not None:
-        sleep_calls.append(duration)
-
-
-def fake_time_time():
-    """Return 0 when running time.time()"""
-    return 0
-
-
-@pytest.fixture
-def monkeypatch_sleep_calls(monkeypatch) -> Iterator[List[float]]:
-    """Monkeypatch `time.time` and `time.sleep`. Returns a list cumulating the arguments
-    passed to time.sleep()."""
-    sleeps: List[float] = []
-    monkeypatch.setattr(time, "sleep", lambda d: fake_time_sleep(d, sleeps))
-    monkeypatch.setattr(time, "time", fake_time_time)
-    yield sleeps
-
-
@pytest.mark.parametrize(
    "num_ratelimit", [1]
 )  # return a single rate-limit response, then continue
@ -358,7 +185,7 @@ def test_ratelimit_once_recovery(
    lister_credentials,
 ):
    """Check that the lister recovers from hitting the rate-limit once"""
-    caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)

@ -396,7 +223,7 @@ def test_ratelimit_reset_sleep(
 ):
    """Check that the lister properly handles rate-limiting when providing it with
    authentication tokens"""
-    caplog.set_level(logging.DEBUG, "swh.lister.github.utils")
+    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)

--- a/swh/lister/github/utils.py
+++ b/swh/lister/github/utils.py
@ -3,168 +3,4 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import logging
-import random
-import time
-from typing import Dict, List, Optional
-
-import requests
-from tenacity import (
-    retry,
-    retry_any,
-    retry_if_exception_type,
-    retry_if_result,
-    wait_exponential,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class RateLimited(Exception):
-    def __init__(self, response):
-        self.reset_time: Optional[int]
-
-        # Figure out how long we need to sleep because of that rate limit
-        ratelimit_reset = response.headers.get("X-Ratelimit-Reset")
-        retry_after = response.headers.get("Retry-After")
-        if ratelimit_reset is not None:
-            self.reset_time = int(ratelimit_reset)
-        elif retry_after is not None:
-            self.reset_time = int(time.time()) + int(retry_after) + 1
-        else:
-            logger.warning(
-                "Received a rate-limit-like status code %s, but no rate-limit "
-                "headers set. Response content: %s",
-                response.status_code,
-                response.content,
-            )
-            self.reset_time = None
-        self.response = response
-
-
-class MissingRateLimitReset(Exception):
-    pass
-
-
-class GitHubSession:
-    """Manages a :class:`requests.Session` with (optionally) multiple credentials,
-    and cycles through them when reaching rate-limits."""
-
-    def __init__(
-        self, user_agent: str, credentials: Optional[List[Dict[str, str]]] = None
-    ) -> None:
-        """Initialize a requests session with the proper headers for requests to
-        GitHub."""
-        self.credentials = credentials
-        if self.credentials:
-            random.shuffle(self.credentials)
-
-        self.session = requests.Session()
-
-        self.session.headers.update(
-            {"Accept": "application/vnd.github.v3+json", "User-Agent": user_agent}
-        )
-
-        self.anonymous = not self.credentials
-
-        if self.anonymous:
-            logger.warning("No tokens set in configuration, using anonymous mode")
-
-        self.token_index = -1
-        self.current_user: Optional[str] = None
-
-        if not self.anonymous:
-            # Initialize the first token value in the session headers
-            self.set_next_session_token()
-
-    def set_next_session_token(self) -> None:
-        """Update the current authentication token with the next one in line."""
-
-        assert self.credentials
-
-        self.token_index = (self.token_index + 1) % len(self.credentials)
-
-        auth = self.credentials[self.token_index]
-
-        self.current_user = auth["username"]
-        logger.debug("Using authentication token for user %s", self.current_user)
-
-        if "password" in auth:
-            token = auth["password"]
-        else:
-            token = auth["token"]
-
-        self.session.headers.update({"Authorization": f"token {token}"})
-
-    @retry(
-        wait=wait_exponential(multiplier=1, min=4, max=10),
-        retry=retry_any(
-            # ChunkedEncodingErrors happen when the TLS connection gets reset, e.g.
-            # when running the lister on a connection with high latency
-            retry_if_exception_type(requests.exceptions.ChunkedEncodingError),
-            # 502 status codes happen for a Server Error, sometimes
-            retry_if_result(lambda r: r.status_code == 502),
-        ),
-    )
-    def _request(self, url: str) -> requests.Response:
-        response = self.session.get(url)
-
-        if (
-            # GitHub returns inconsistent status codes between unauthenticated
-            # rate limit and authenticated rate limits. Handle both.
-            response.status_code == 429
-            or (self.anonymous and response.status_code == 403)
-        ):
-            raise RateLimited(response)
-
-        return response
-
-    def request(self, url) -> requests.Response:
-        """Repeatedly requests the given URL, cycling through credentials and sleeping
-        if necessary; until either a successful response or :exc:`MissingRateLimitReset`
-        """
-        # The following for/else loop handles rate limiting; if successful,
-        # it provides the rest of the function with a `response` object.
-        #
-        # If all tokens are rate-limited, we sleep until the reset time,
-        # then `continue` into another iteration of the outer while loop,
-        # attempting to get data from the same URL again.
-
-        while True:
-            max_attempts = len(self.credentials) if self.credentials else 1
-            reset_times: Dict[int, int] = {}  # token index -> time
-            for attempt in range(max_attempts):
-                try:
-                    return self._request(url)
-                except RateLimited as e:
-                    reset_info = "(unknown reset)"
-                    if e.reset_time is not None:
-                        reset_times[self.token_index] = e.reset_time
-                        reset_info = "(resetting in %ss)" % (e.reset_time - time.time())
-
-                    if not self.anonymous:
-                        logger.info(
-                            "Rate limit exhausted for current user %s %s",
-                            self.current_user,
-                            reset_info,
-                        )
-                        # Use next token in line
-                        self.set_next_session_token()
-                        # Wait one second to avoid triggering GitHub's abuse rate limits
-                        time.sleep(1)
-
-            # All tokens have been rate-limited. What do we do?
-
-            if not reset_times:
-                logger.warning(
-                    "No X-Ratelimit-Reset value found in responses for any token; "
-                    "Giving up."
-                )
-                raise MissingRateLimitReset()
-
-            sleep_time = max(reset_times.values()) - time.time() + 1
-            logger.info(
-                "Rate limits exhausted for all tokens. Sleeping for %f seconds.",
-                sleep_time,
-            )
-            time.sleep(sleep_time)
+from swh.core.github.utils import *  # noqa