gitea: Inherit from Gogs lister
This removes code and adds support for incremental pagination. Although both are now essentially the same lister, it still makes sense to keep the Gitea lister separate, in order to:
1. display them in different categories on https://archive.softwareheritage.org/
2. support possible divergence of the APIs in the future
This commit is contained in:
parent
dde7865ac4
commit
17a219ece0
3 changed files with 32 additions and 132 deletions
|
@ -4,27 +4,13 @@
|
|||
# See top-level LICENSE file for more information
|
||||
|
||||
import logging
|
||||
import random
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import iso8601
|
||||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
from ..gogs.lister import GogsLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
RepoListPage = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class GiteaLister(StatelessLister[RepoListPage]):
|
||||
class GiteaLister(GogsLister):
|
||||
"""List origins from Gitea.
|
||||
|
||||
Gitea API documentation: https://try.gitea.io/api/swagger
|
||||
|
@ -35,108 +21,7 @@ class GiteaLister(StatelessLister[RepoListPage]):
|
|||
|
||||
LISTER_NAME = "gitea"
|
||||
|
||||
REPO_LIST_PATH = "repos/search"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
scheduler: SchedulerInterface,
|
||||
url: str,
|
||||
instance: Optional[str] = None,
|
||||
api_token: Optional[str] = None,
|
||||
page_size: int = 50,
|
||||
credentials: CredentialsType = None,
|
||||
):
|
||||
super().__init__(
|
||||
scheduler=scheduler,
|
||||
credentials=credentials,
|
||||
url=url,
|
||||
instance=instance,
|
||||
def on_anonymous_mode(self):
|
||||
logger.warning(
|
||||
"No authentication token set in configuration, using anonymous mode"
|
||||
)
|
||||
|
||||
self.query_params = {
|
||||
"sort": "id",
|
||||
"order": "asc",
|
||||
"limit": page_size,
|
||||
"page": 1,
|
||||
}
|
||||
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{
|
||||
"Accept": "application/json",
|
||||
"User-Agent": USER_AGENT,
|
||||
}
|
||||
)
|
||||
|
||||
if api_token is None:
|
||||
if len(self.credentials) > 0:
|
||||
cred = random.choice(self.credentials)
|
||||
username = cred.get("username")
|
||||
api_token = cred["password"]
|
||||
logger.warning(
|
||||
"Using authentication token from user %s", username or "???"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"No authentication token set in configuration, using anonymous mode"
|
||||
)
|
||||
|
||||
if api_token:
|
||||
self.session.headers["Authorization"] = "Token %s" % api_token
|
||||
|
||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
|
||||
|
||||
logger.info("Fetching URL %s with params %s", url, params)
|
||||
|
||||
response = self.session.get(url, params=params)
|
||||
|
||||
if response.status_code != 200:
|
||||
logger.warning(
|
||||
"Unexpected HTTP status code %s on %s: %s",
|
||||
response.status_code,
|
||||
response.url,
|
||||
response.content,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
return response
|
||||
|
||||
@classmethod
|
||||
def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage:
|
||||
fields_filter = ["id", "clone_url", "updated_at"]
|
||||
return [{k: r[k] for k in fields_filter} for r in body["data"]]
|
||||
|
||||
def get_pages(self) -> Iterator[RepoListPage]:
|
||||
# base with trailing slash, path without leading slash for urljoin
|
||||
url: str = urljoin(self.url, self.REPO_LIST_PATH)
|
||||
|
||||
response = self.page_request(url, self.query_params)
|
||||
|
||||
while True:
|
||||
page_results = self.results_simplified(response.json())
|
||||
|
||||
yield page_results
|
||||
|
||||
assert len(response.links) > 0, "API changed: no Link header found"
|
||||
if "next" in response.links:
|
||||
url = response.links["next"]["url"]
|
||||
else:
|
||||
# last page
|
||||
break
|
||||
|
||||
response = self.page_request(url, {})
|
||||
|
||||
def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]:
|
||||
"""Convert a page of Gitea repositories into a list of ListedOrigins."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for repo in page:
|
||||
last_update = iso8601.parse_date(repo["updated_at"])
|
||||
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=repo["clone_url"],
|
||||
visit_type="git",
|
||||
last_update=last_update,
|
||||
)
|
||||
|
|
|
@ -10,33 +10,40 @@ from typing import Dict, List, Tuple
|
|||
import pytest
|
||||
import requests
|
||||
|
||||
from swh.lister.gitea.lister import GiteaLister, RepoListPage
|
||||
from swh.lister.gitea.lister import GiteaLister
|
||||
from swh.lister.gogs.lister import GogsListerPage
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
TRYGITEA_URL = "https://try.gitea.io/api/v1/"
|
||||
TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=1"
|
||||
TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?sort=id&order=asc&limit=3&page=2"
|
||||
TRYGITEA_P1_URL = TRYGITEA_URL + "repos/search?limit=3&page=1"
|
||||
TRYGITEA_P2_URL = TRYGITEA_URL + "repos/search?limit=3&page=2"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]:
|
||||
def trygitea_p1(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]:
|
||||
text = Path(datadir, "https_try.gitea.io", "repos_page1").read_text()
|
||||
headers = {
|
||||
"Link": '<{p2}>; rel="next",<{p2}>; rel="last"'.format(p2=TRYGITEA_P2_URL)
|
||||
}
|
||||
page_result = GiteaLister.results_simplified(json.loads(text))
|
||||
origin_urls = [r["clone_url"] for r in page_result]
|
||||
page_data = json.loads(text)
|
||||
page_result = GogsListerPage(
|
||||
repos=GiteaLister.extract_repos(page_data), next_link=TRYGITEA_P2_URL
|
||||
)
|
||||
origin_urls = [r["clone_url"] for r in page_data["data"]]
|
||||
return text, headers, page_result, origin_urls
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], RepoListPage, List[str]]:
|
||||
def trygitea_p2(datadir) -> Tuple[str, Dict[str, str], GogsListerPage, List[str]]:
|
||||
text = Path(datadir, "https_try.gitea.io", "repos_page2").read_text()
|
||||
headers = {
|
||||
"Link": '<{p1}>; rel="prev",<{p1}>; rel="first"'.format(p1=TRYGITEA_P1_URL)
|
||||
}
|
||||
page_result = GiteaLister.results_simplified(json.loads(text))
|
||||
origin_urls = [r["clone_url"] for r in page_result]
|
||||
page_data = json.loads(text)
|
||||
page_result = GogsListerPage(
|
||||
repos=GiteaLister.extract_repos(page_data), next_link=None
|
||||
)
|
||||
origin_urls = [r["clone_url"] for r in page_data["data"]]
|
||||
return text, headers, page_result, origin_urls
|
||||
|
||||
|
||||
|
@ -93,7 +100,9 @@ def test_gitea_full_listing(
|
|||
|
||||
check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
|
||||
|
||||
assert lister.get_state_from_scheduler() is None
|
||||
lister_state = lister.get_state_from_scheduler()
|
||||
assert lister_state.last_seen_next_link == TRYGITEA_P2_URL
|
||||
assert lister_state.last_seen_repo_id == p2_result.repos[-1]["id"]
|
||||
|
||||
|
||||
def test_gitea_auth_instance(swh_scheduler, requests_mock, trygitea_p1):
|
||||
|
|
|
@ -96,7 +96,8 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
|
|||
"Using authentication credentials from user %s", username or "???"
|
||||
)
|
||||
else:
|
||||
raise ValueError("No credentials or API token provided")
|
||||
# Raises an error on Gogs, or a warning on Gitea
|
||||
self.on_anonymous_mode()
|
||||
|
||||
self.max_page_limit = 2
|
||||
|
||||
|
@ -105,10 +106,15 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
|
|||
{
|
||||
"Accept": "application/json",
|
||||
"User-Agent": USER_AGENT,
|
||||
"Authorization": f"token {self.api_token}",
|
||||
}
|
||||
)
|
||||
|
||||
if self.api_token:
|
||||
self.session.headers["Authorization"] = f"token {self.api_token}"
|
||||
|
||||
def on_anonymous_mode(self):
|
||||
raise ValueError("No credentials or API token provided")
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> GogsListerState:
|
||||
return GogsListerState(**d)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue