From 92d494261f267d6552d9a24361fd612974f1c982 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" Date: Wed, 26 Oct 2022 17:23:40 +0200 Subject: [PATCH] lister: Make sure lister that requires github tokens can use it Deploying the nixguix lister, I realized that even though the credentials configuration is properly set for all listers, the listers actually requiring github origin canonicalization do not have access to the github credentials. It's lost during the constructor to only focus on the lister's credentials. Which currently translates to listers being rate-limited. This commit fixes it by pushing the self.github_session instantiation in the constructor when the lister explicitely requires the github session. Hence lifting the rate limit for maven, packagist, nixguix, and github listers. Related to infra/sysadm-environment#4655 --- swh/lister/github/lister.py | 9 +++------ swh/lister/github/tests/test_lister.py | 2 +- swh/lister/maven/lister.py | 7 ++----- swh/lister/nixguix/lister.py | 12 +----------- swh/lister/packagist/lister.py | 7 ++----- swh/lister/pattern.py | 11 +++++++++++ 6 files changed, 20 insertions(+), 28 deletions(-) diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index ae10d71..5728727 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -11,7 +11,7 @@ from urllib.parse import parse_qs, urlparse import iso8601 -from swh.core.github.utils import GitHubSession, MissingRateLimitReset +from swh.core.github.utils import MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -78,6 +78,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): credentials=credentials, url=self.API_URL, instance="github", + with_github_session=True, ) self.first_id = first_id @@ -85,11 +86,6 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): self.relisting = self.first_id is not None or self.last_id is not None - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) - def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: return GitHubListerState(**d) @@ -109,6 +105,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): logger.debug("Getting page %s", current_url) try: + assert self.github_session is not None response = self.github_session.request(current_url) except MissingRateLimitReset: # Give up diff --git a/swh/lister/github/tests/test_lister.py b/swh/lister/github/tests/test_lister.py index 88c5bf4..a09d606 100644 --- a/swh/lister/github/tests/test_lister.py +++ b/swh/lister/github/tests/test_lister.py @@ -141,7 +141,7 @@ def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> Non caplog.set_level(logging.DEBUG, "swh.core.github.utils") lister = GitHubLister(scheduler=swh_scheduler) - assert lister.github_session.anonymous + assert lister.github_session is not None and lister.github_session.anonymous assert "using anonymous mode" in caplog.records[-1].message caplog.clear() diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 2055b91..195a8a3 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -14,7 +14,6 @@ from bs4 import BeautifulSoup import lxml import requests -from swh.core.github.utils import GitHubSession from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -88,15 +87,12 @@ class MavenLister(Lister[MavenListerState, RepoPage]): credentials=credentials, url=url, instance=instance, + with_github_session=True, ) self.session.headers.update({"Accept": "application/json"}) self.jar_origins: Dict[str, ListedOrigin] = {} - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: return MavenListerState(**d) @@ -294,6 +290,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]): return None if url and visit_type == "git": + assert self.github_session is not None # Non-github urls will be returned as is, github ones will be canonical ones url = self.github_session.get_canonical_url(url) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 0b8e8be..9ebe82e 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -29,7 +29,6 @@ from urllib.parse import parse_qsl, urlparse import requests from requests.exceptions import ConnectionError, InvalidSchema, SSLError -from swh.core.github.utils import GitHubSession from swh.core.tarball import MIMETYPE_TO_ARCHIVE_FORMAT from swh.lister import TARBALL_EXTENSIONS from swh.lister.pattern import CredentialsType, StatelessLister @@ -331,6 +330,7 @@ class NixGuixLister(StatelessLister[PageResult]): url=url.rstrip("/"), instance=instance, credentials=credentials, + with_github_session=canonicalize, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? @@ -338,16 +338,6 @@ class NixGuixLister(StatelessLister[PageResult]): self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore self.session = requests.Session() - # for testing purposes, we may want to skip this step (e.g. docker run and rate - # limit) - self.github_session = ( - GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) - if canonicalize - else None - ) def build_artifact( self, artifact_url: str, artifact_type: str, artifact_ref: Optional[str] = None diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index 251c25a..e9fa296 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -11,7 +11,6 @@ from typing import Any, Dict, Iterator, List, Optional import iso8601 import requests -from swh.core.github.utils import GitHubSession from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -60,14 +59,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): url=self.PACKAGIST_PACKAGES_LIST_URL, instance="packagist", credentials=credentials, + with_github_session=True, ) self.session.headers.update({"Accept": "application/json"}) self.listing_date = datetime.now().astimezone(tz=timezone.utc) - self.github_session = GitHubSession( - credentials=self.credentials, - user_agent=str(self.session.headers["User-Agent"]), - ) def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: last_listing_date = d.get("last_listing_date") @@ -152,6 +148,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): if visit_type == "git": # Non-github urls will be returned as is, github ones will be canonical # ones + assert self.github_session is not None origin_url = ( self.github_session.get_canonical_url(origin_url) or origin_url ) diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 7492683..5b3a33d 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -14,6 +14,7 @@ import requests from tenacity.before_sleep import before_sleep_log from swh.core.config import load_from_envvar +from swh.core.github.utils import GitHubSession from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface @@ -93,6 +94,7 @@ class Lister(Generic[StateType, PageType]): """ LISTER_NAME: str = "" + github_session: Optional[GitHubSession] = None def __init__( self, @@ -100,6 +102,7 @@ class Lister(Generic[StateType, PageType]): url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + with_github_session: bool = False, ): if not self.LISTER_NAME: raise ValueError("Must set the LISTER_NAME attribute on Lister classes") @@ -127,6 +130,14 @@ class Lister(Generic[StateType, PageType]): self.session.headers.update( {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME} ) + self.github_session: Optional[GitHubSession] = ( + GitHubSession( + credentials=credentials.get("github", {}).get("github", []), + user_agent=str(self.session.headers["User-Agent"]), + ) + if with_github_session + else None + ) self.recorded_origins: Set[str] = set()