lister: Make sure lister that requires github tokens can use it
Deploying the nixguix lister, I realized that even though the credentials configuration is properly set for all listers, the listers actually requiring github origin canonicalization do not have access to the github credentials. It's lost during the constructor to only focus on the lister's credentials. Which currently translates to listers being rate-limited. This commit fixes it by pushing the self.github_session instantiation in the constructor when the lister explicitely requires the github session. Hence lifting the rate limit for maven, packagist, nixguix, and github listers. Related to infra/sysadm-environment#4655
This commit is contained in:
parent
81688ca17e
commit
92d494261f
6 changed files with 20 additions and 28 deletions
|
@ -11,7 +11,7 @@ from urllib.parse import parse_qs, urlparse
|
|||
|
||||
import iso8601
|
||||
|
||||
from swh.core.github.utils import GitHubSession, MissingRateLimitReset
|
||||
from swh.core.github.utils import MissingRateLimitReset
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -78,6 +78,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
|
|||
credentials=credentials,
|
||||
url=self.API_URL,
|
||||
instance="github",
|
||||
with_github_session=True,
|
||||
)
|
||||
|
||||
self.first_id = first_id
|
||||
|
@ -85,11 +86,6 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
|
|||
|
||||
self.relisting = self.first_id is not None or self.last_id is not None
|
||||
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
|
||||
return GitHubListerState(**d)
|
||||
|
||||
|
@ -109,6 +105,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
|
|||
logger.debug("Getting page %s", current_url)
|
||||
|
||||
try:
|
||||
assert self.github_session is not None
|
||||
response = self.github_session.request(current_url)
|
||||
except MissingRateLimitReset:
|
||||
# Give up
|
||||
|
|
|
@ -141,7 +141,7 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non
|
|||
caplog.set_level(logging.DEBUG, "swh.core.github.utils")
|
||||
|
||||
lister = GitHubLister(scheduler=swh_scheduler)
|
||||
assert lister.github_session.anonymous
|
||||
assert lister.github_session is not None and lister.github_session.anonymous
|
||||
assert "using anonymous mode" in caplog.records[-1].message
|
||||
caplog.clear()
|
||||
|
||||
|
|
|
@ -14,7 +14,6 @@ from bs4 import BeautifulSoup
|
|||
import lxml
|
||||
import requests
|
||||
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -88,15 +87,12 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
credentials=credentials,
|
||||
url=url,
|
||||
instance=instance,
|
||||
with_github_session=True,
|
||||
)
|
||||
|
||||
self.session.headers.update({"Accept": "application/json"})
|
||||
|
||||
self.jar_origins: Dict[str, ListedOrigin] = {}
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
|
||||
return MavenListerState(**d)
|
||||
|
@ -294,6 +290,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
return None
|
||||
|
||||
if url and visit_type == "git":
|
||||
assert self.github_session is not None
|
||||
# Non-github urls will be returned as is, github ones will be canonical ones
|
||||
url = self.github_session.get_canonical_url(url)
|
||||
|
||||
|
|
|
@ -29,7 +29,6 @@ from urllib.parse import parse_qsl, urlparse
|
|||
import requests
|
||||
from requests.exceptions import ConnectionError, InvalidSchema, SSLError
|
||||
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT
|
||||
from swh.lister import TARBALL_EXTENSIONS
|
||||
from swh.lister.pattern import CredentialsType, StatelessLister
|
||||
|
@ -331,6 +330,7 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
url=url.rstrip("/"),
|
||||
instance=instance,
|
||||
credentials=credentials,
|
||||
with_github_session=canonicalize,
|
||||
)
|
||||
# either full fqdn NixOS/nixpkgs or guix repository urls
|
||||
# maybe add an assert on those specific urls?
|
||||
|
@ -338,16 +338,6 @@ class NixGuixLister(StatelessLister[PageResult]):
|
|||
self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
|
||||
|
||||
self.session = requests.Session()
|
||||
# for testing purposes, we may want to skip this step (e.g. docker run and rate
|
||||
# limit)
|
||||
self.github_session = (
|
||||
GitHubSession(
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
if canonicalize
|
||||
else None
|
||||
)
|
||||
|
||||
def build_artifact(
|
||||
self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None
|
||||
|
|
|
@ -11,7 +11,6 @@ from typing import Any, Dict, Iterator, List, Optional
|
|||
import iso8601
|
||||
import requests
|
||||
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
|
@ -60,14 +59,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
|
|||
url=self.PACKAGIST_PACKAGES_LIST_URL,
|
||||
instance="packagist",
|
||||
credentials=credentials,
|
||||
with_github_session=True,
|
||||
)
|
||||
|
||||
self.session.headers.update({"Accept": "application/json"})
|
||||
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
|
||||
last_listing_date = d.get("last_listing_date")
|
||||
|
@ -152,6 +148,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
|
|||
if visit_type == "git":
|
||||
# Non-github urls will be returned as is, github ones will be canonical
|
||||
# ones
|
||||
assert self.github_session is not None
|
||||
origin_url = (
|
||||
self.github_session.get_canonical_url(origin_url) or origin_url
|
||||
)
|
||||
|
|
|
@ -14,6 +14,7 @@ import requests
|
|||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.core.config import load_from_envvar
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.core.utils import grouper
|
||||
from swh.scheduler import get_scheduler, model
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -93,6 +94,7 @@ class Lister(Generic[StateType, PageType]):
|
|||
"""
|
||||
|
||||
LISTER_NAME: str = ""
|
||||
github_session: Optional[GitHubSession] = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -100,6 +102,7 @@ class Lister(Generic[StateType, PageType]):
|
|||
url: str,
|
||||
instance: Optional[str] = None,
|
||||
credentials: CredentialsType = None,
|
||||
with_github_session: bool = False,
|
||||
):
|
||||
if not self.LISTER_NAME:
|
||||
raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
|
||||
|
@ -127,6 +130,14 @@ class Lister(Generic[StateType, PageType]):
|
|||
self.session.headers.update(
|
||||
{"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
|
||||
)
|
||||
self.github_session: Optional[GitHubSession] = (
|
||||
GitHubSession(
|
||||
credentials=credentials.get("github", {}).get("github", []),
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
if with_github_session
|
||||
else None
|
||||
)
|
||||
|
||||
self.recorded_origins: Set[str] = set()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue