Refactor and deduplicate HTTP request code in listers

Numerous listers were using the same page_request method, or an equivalent,
in their implementations, so deduplicate that code by adding an
http_request method to the base lister class swh.lister.pattern.Lister.

That method simply wraps a call to requests.Session.request and logs
useful info for debugging and error reporting; it also raises an
HTTPError if a request ends up in error.

All listers using that new method now benefit from request retries when
an HTTP error occurs, thanks to the http_retry decorator.
Antoine Lambert committed 2022-09-21 19:53:22 +02:00
parent 9c55acd286
commit db6ce12e9e
28 changed files with 174 additions and 449 deletions
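
For orientation, here is roughly what the change looks like from a lister's point of view; the shared helper itself appears in the swh.lister.pattern hunk further down. The ExampleLister class and its endpoint are a hypothetical, minimal sketch:

    from typing import Any, Dict, Iterator

    from swh.lister.pattern import StatelessLister

    class ExampleLister(StatelessLister[Dict[str, Any]]):
        """Hypothetical lister illustrating the shared http_request helper."""

        def get_pages(self) -> Iterator[Dict[str, Any]]:
            # http_request logs the request, retries transient errors through
            # the http_retry decorator, and raises HTTPError on failure.
            response = self.http_request(self.url, params={"page": 1})
            yield response.json()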

@ -1,4 +1,4 @@
# Copyright (C) 2021 The Software Heritage developers
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -8,14 +8,9 @@ import logging
from typing import Any, Dict, Iterator, List
from urllib.parse import urljoin
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -65,11 +60,7 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
instance=instance,
)
self.session = requests.Session()
# Declare the USER_AGENT is more sysadm-friendly for the forge we list
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
return NewForgeListerState(**d)
@ -77,32 +68,6 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
return asdict(state)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
# Do the network resource request under a retrying decorator
# to handle rate limiting and transient errors up to a limit.
# `http_retry` by default use the `requests` library to check
# only for rate-limit and a base-10 exponential waiting strategy.
# This can be customized by passed waiting, retrying and logging strategies
# as functions. See the `tenacity` library documentation.
# Log listed URL to ease debugging
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
# Log response content to ease debugging
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
# The lister must fail on blocking errors
response.raise_for_status()
return response
def get_pages(self) -> Iterator[NewForgeListerPage]:
# The algorithm depends on the service, but should request data reliably,
# following pagination if relevant and yielding pages in a streaming fashion.
@ -120,7 +85,7 @@ class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
while current is not None:
# Parametrize the request for incremental listing
body = self.page_request(url, {"current": current}).json()
body = self.http_request(url, params={"current": current}).json()
# Simplify the page if possible to only the necessary elements
# and yield it
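
The deleted comments above describe http_retry's defaults (retry on rate limit, exponential backoff) and note they can be customized by passing waiting, retrying and logging strategies. A sketch of such a customization, assuming http_retry forwards these keyword arguments to the underlying tenacity retry as those comments suggest:

    import logging

    import requests
    from tenacity import retry_if_exception_type, stop_after_attempt
    from tenacity.before_sleep import before_sleep_log

    from swh.lister.utils import http_retry

    logger = logging.getLogger(__name__)

    # The retry and stop arguments below are assumptions; see the tenacity
    # documentation for the available strategies.
    @http_retry(
        retry=retry_if_exception_type(requests.ConnectionError),
        stop=stop_after_attempt(5),
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def fetch(session: requests.Session, url: str) -> requests.Response:
        response = session.get(url)
        response.raise_for_status()
        return response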

@ -1,4 +1,4 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,8 +7,6 @@ import logging
import pkg_resources
from swh.lister import pattern
logger = logging.getLogger(__name__)
@ -53,6 +51,9 @@ def get_lister(lister_name, db_url=None, **conf):
registry_entry = LISTERS[lister_name].load()()
lister_cls = registry_entry["lister"]
from swh.lister import pattern
if issubclass(lister_cls, pattern.Lister):
return lister_cls.from_config(**conf)
else:

@ -13,15 +13,11 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote, urljoin
from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@ -125,29 +121,6 @@ class ArchLister(StatelessLister[ArchListerPage]):
)
self.flavours = flavours
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": USER_AGENT,
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def scrap_package_versions(
self, name: str, repo: str, base_url: str
@ -179,7 +152,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
pkgname=name, base_url=base_url
)
response = self.request_get(url=url, params={})
response = self.http_request(url)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
@ -263,7 +236,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
Returns:
a directory Path where the archive has been extracted to.
"""
res = self.request_get(url=url, params={})
res = self.http_request(url)
destination_path.parent.mkdir(parents=True, exist_ok=True)
destination_path.write_bytes(res.content)

@ -7,8 +7,6 @@ import datetime
import logging
from typing import Any, Dict, Iterator, List, Optional
import requests
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -65,7 +63,7 @@ class AurLister(StatelessLister[AurListerPage]):
a directory Path where the archive has been downloaded to.
"""
url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
return requests.get(url).json()
return self.http_request(url).json()
def get_pages(self) -> Iterator[AurListerPage]:
"""Yield an iterator which returns 'page'

@ -1,4 +1,4 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -11,14 +11,10 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib import parse
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -77,10 +73,7 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
),
}
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
if len(self.credentials) > 0:
cred = random.choice(self.credentials)
@ -107,25 +100,6 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
if username is not None and password is not None:
self.session.auth = (username, password)
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def page_request(self, last_repo_cdate: str) -> requests.Response:
self.url_params["after"] = last_repo_cdate
logger.debug("Fetching URL %s with params %s", self.url, self.url_params)
response = self.session.get(self.url, params=self.url_params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
last_repo_cdate: str = "1970-01-01"
@ -137,7 +111,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
last_repo_cdate = self.state.last_repo_cdate.isoformat()
while True:
body = self.page_request(last_repo_cdate).json()
self.url_params["after"] = last_repo_cdate
body = self.http_request(self.url, params=self.url_params).json()
yield body["values"]

@ -1,4 +1,4 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -69,14 +69,16 @@ def test_bitbucket_incremental_lister(
assert last_repo_cdate == bb_api_repositories_page2["values"][-1]["created_on"]
# Second listing, restarting from last state
lister.session.get = mocker.spy(lister.session, "get")
lister.session.request = mocker.spy(lister.session, "request")
lister.run()
url_params = lister.url_params
url_params["after"] = last_repo_cdate
lister.session.get.assert_called_once_with(lister.API_URL, params=url_params)
lister.session.request.assert_called_once_with(
"GET", lister.API_URL, params=url_params
)
all_origins = (
bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"]
@ -106,7 +108,7 @@ def test_bitbucket_lister_rate_limit_hit(
lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)
mocker.patch.object(lister.page_request.retry, "sleep")
mocker.patch.object(lister.http_request.retry, "sleep")
stats = lister.run()
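
The test updates follow a common shape: spy on requests.Session.request, which the shared helper always goes through, and stub the tenacity sleep on the wrapped method so retries do not slow the test suite down. A condensed, hypothetical sketch:

    from swh.lister.bitbucket.lister import BitbucketLister

    def test_http_request_mocking_points(swh_scheduler, requests_mock, mocker):
        lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)

        # Stub the exponential backoff so retried requests return immediately.
        mocker.patch.object(lister.http_request.retry, "sleep")
        # Spy on the underlying Session.request call used by http_request.
        spy_request = mocker.spy(lister.session, "request")

        requests_mock.get("https://example.org/api", json={})
        lister.http_request("https://example.org/api", params={"page": 1})

        spy_request.assert_called_once_with(
            "GET", "https://example.org/api", params={"page": 1}
        )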

@ -2,17 +2,13 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
from typing import Dict, Iterator, List, Optional
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@ -41,30 +37,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
instance=self.INSTANCE,
url=self.API_URL,
)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
self.session.headers.update({"Accept": "application/json"})
def get_pages(self) -> Iterator[BowerListerPage]:
"""Yield an iterator which returns 'page'
@ -75,7 +48,7 @@ class BowerLister(StatelessLister[BowerListerPage]):
There is only one page that list all origins urls.
"""
response = self.page_request(url=self.url, params={})
response = self.http_request(self.url)
yield response.json()
def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]:

@ -1,4 +1,4 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2022 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -9,13 +9,9 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -73,17 +69,12 @@ class CGitLister(StatelessLister[Repositories]):
credentials=credentials,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/html"})
self.base_git_url = base_git_url
@http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)
response.raise_for_status()
response = self.http_request(url)
return BeautifulSoup(response.text, features="html.parser")
def get_pages(self) -> Iterator[Repositories]:

@ -1,4 +1,4 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2022 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -258,7 +258,7 @@ def test_lister_cgit_get_pages_with_pages_and_retry(
lister_cgit = CGitLister(swh_scheduler, url=url)
mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep")
mocker.patch.object(lister_cgit.http_request.retry, "sleep")
repos: List[List[str]] = list(lister_cgit.get_pages())
flattened_repos = sum(repos, [])

@ -1,9 +1,8 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import bz2
from collections import defaultdict
from dataclasses import dataclass, field
@ -17,12 +16,11 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin
from debian.deb822 import Sources
import requests
from requests.exceptions import HTTPError
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -95,9 +93,6 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
self.suites = suites
self.components = components
self.session = requests.Session()
self.session.headers.update({"User-Agent": USER_AGENT})
# will hold all listed origins info
self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {}
# will contain origin urls that have already been listed
@ -132,9 +127,11 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
def page_request(self, suite: Suite, component: Component) -> DebianPageType:
"""Return parsed package Sources file for a given debian suite and component."""
for url, compression in self.debian_index_urls(suite, component):
response = requests.get(url, stream=True)
logging.debug("Fetched URL: %s, status code: %s", url, response.status_code)
if response.status_code == 200:
try:
response = self.http_request(url, stream=True)
except HTTPError:
pass
else:
last_modified = response.headers.get("Last-Modified")
self.last_sources_update = (
parsedate_to_datetime(last_modified) if last_modified else None
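
Keyword arguments such as stream=True are forwarded unchanged to requests.Session.request, so download-style callers keep working, and a failed candidate URL now surfaces as an HTTPError that the code above catches before trying the next one. A sketch of a streaming download, with a hypothetical destination path:

    from pathlib import Path

    def download(lister, url: str, destination: Path) -> None:
        # stream=True reaches requests.Session.request untouched.
        response = lister.http_request(url, stream=True)
        destination.parent.mkdir(parents=True, exist_ok=True)
        with destination.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)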

@ -1,4 +1,4 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -10,12 +10,10 @@ from typing import Any, Dict, Iterator, Optional, Tuple
from urllib.parse import parse_qs, urlencode, urlparse
import iso8601
import requests
from requests.exceptions import HTTPError
from requests.status_codes import codes
from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
from swh.lister.utils import http_retry, is_retryable_exception
from swh.scheduler.model import ListedOrigin
@ -118,10 +116,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
self.last_page: Optional[str] = None
self.per_page = 100
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
if len(self.credentials) > 0:
cred = random.choice(self.credentials)

@ -2,6 +2,7 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import logging
import random
@ -9,14 +10,11 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple
from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from requests.exceptions import HTTPError
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -96,13 +94,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
# Raises an error on Gogs, or a warning on Gitea
self.on_anonymous_mode()
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)
self.session.headers.update({"Accept": "application/json"})
if self.api_token:
self.session.headers["Authorization"] = f"token {self.api_token}"
@ -116,34 +108,27 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]:
return asdict(state)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(
self, url: str, params: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
if (
response.status_code == 500
): # Temporary hack for skipping fatal repos (T4423)
url_parts = urlparse(url)
query: Dict[str, Any] = dict(parse_qsl(url_parts.query))
query.update({"page": _parse_page_id(url) + 1})
next_page_link = url_parts._replace(query=urlencode(query)).geturl()
body: Dict[str, Any] = {"data": []}
links = {"next": {"url": next_page_link}}
return body, links
else:
response.raise_for_status()
try:
response = self.http_request(url, params=params)
except HTTPError as http_error:
if (
http_error.response.status_code == 500
): # Temporary hack for skipping fatal repos (T4423)
url_parts = urlparse(url)
query: Dict[str, Any] = dict(parse_qsl(url_parts.query))
query.update({"page": _parse_page_id(url) + 1})
next_page_link = url_parts._replace(query=urlencode(query)).geturl()
body: Dict[str, Any] = {"data": []}
links = {"next": {"url": next_page_link}}
return body, links
else:
raise
return response.json(), response.links
@ -159,6 +144,7 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]):
# base with trailing slash, path without leading slash for urljoin
next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH)
assert next_link is not None
body, links = self.page_request(
next_link, {**self.query_params, "page": page_id}
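
Because the shared helper raises HTTPError once retries are exhausted, per-status workarounds like the Gogs one above turn into try/except blocks around http_request. The general shape, with a hypothetical fallback value:

    from requests.exceptions import HTTPError

    def fetch_with_fallback(lister, url: str) -> dict:
        try:
            return lister.http_request(url).json()
        except HTTPError as http_error:
            if http_error.response.status_code == 500:
                # Hypothetical recovery: treat the page as empty.
                return {"data": []}
            raise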

@ -264,7 +264,7 @@ def test_gogs_incremental_lister(
p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last
requests_mock.get(P3, text=p3_text, headers=p3_headers)
lister.session.get = mocker.spy(lister.session, "get")
lister.session.request = mocker.spy(lister.session, "request")
attempt2_stats = lister.run()
@ -277,8 +277,8 @@ def test_gogs_incremental_lister(
query_params = lister.query_params
query_params["page"] = page_id
lister.session.get.assert_called_once_with(
TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
lister.session.request.assert_called_once_with(
"GET", TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params
)
# All the 9 origins (3 pages) should be passed on to the scheduler:

@ -10,14 +10,10 @@ import logging
from typing import Any, Dict, Iterator, List, Optional, Tuple
import iso8601
import requests
from tenacity import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -59,10 +55,7 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
credentials=credentials,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
self.incremental = incremental
def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType:
@ -87,24 +80,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
):
self.updated = True
@http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def api_request(self, url: str) -> List[str]:
logger.debug("Fetching URL %s", url)
response = self.session.get(url)
if response.status_code not in (200, 304):
# Log response content to ease debugging
logger.warning(
"Unexpected HTTP status code %s for URL %s",
response.status_code,
response.url,
)
response.raise_for_status()
response = self.http_request(url)
return response.text.split()
def get_single_page(

@ -98,11 +98,12 @@ def _generate_responses(datadir, requests_mock):
def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
# first listing, should return one origin per package
lister = GolangLister(scheduler=swh_scheduler)
# Exponential retries take a long time, so stub time.sleep
mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
mocked_sleep = mocker.patch.object(GolangLister.http_request.retry, "sleep")
# first listing, should return one origin per package
lister = GolangLister(scheduler=swh_scheduler)
_generate_responses(datadir, requests_mock)
@ -131,7 +132,7 @@ def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
# doing it all again (without incremental) should give us the same result
lister = GolangLister(scheduler=swh_scheduler)
mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
_generate_responses(datadir, requests_mock)
stats = lister.run()

@ -13,10 +13,8 @@ from urllib.parse import urljoin
from bs4 import BeautifulSoup
import lxml
import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.github.utils import GitHubSession
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -93,13 +91,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
instance=instance,
)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)
self.session.headers.update({"Accept": "application/json"})
self.jar_origins: Dict[str, ListedOrigin] = {}
self.github_session = GitHubSession(
@ -112,23 +104,6 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
return asdict(state)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def get_pages(self) -> Iterator[RepoPage]:
"""Retrieve and parse exported maven indexes to
identify all pom files and src archives.
@ -155,10 +130,11 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
# Download the main text index file.
logger.info("Downloading computed index from %s.", self.INDEX_URL)
assert self.INDEX_URL is not None
response = requests.get(self.INDEX_URL, stream=True)
if response.status_code != 200:
try:
response = self.http_request(self.INDEX_URL, stream=True)
except requests.HTTPError:
logger.error("Index %s not found, stopping", self.INDEX_URL)
response.raise_for_status()
raise
# Prepare regexes to parse index exports.
@ -250,9 +226,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
# Now fetch pom files and scan them for scm info.
logger.info("Fetching poms..")
for pom in out_pom:
for pom_url in out_pom:
try:
response = self.page_request(pom, {})
response = self.http_request(pom_url)
parsed_pom = BeautifulSoup(response.content, "xml")
project = parsed_pom.find("project")
if project is None:
@ -263,22 +239,24 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
if connection is not None:
artifact_metadata_d = {
"type": "scm",
"doc": out_pom[pom],
"doc": out_pom[pom_url],
"url": connection.text,
}
logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
logger.debug(
"* Yielding pom %s: %s", pom_url, artifact_metadata_d
)
yield artifact_metadata_d
else:
logger.debug("No scm.connection in pom %s", pom)
logger.debug("No scm.connection in pom %s", pom_url)
else:
logger.debug("No scm in pom %s", pom)
logger.debug("No scm in pom %s", pom_url)
except requests.HTTPError:
logger.warning(
"POM info page could not be fetched, skipping project '%s'",
pom,
pom_url,
)
except lxml.etree.Error as error:
logger.info("Could not parse POM %s XML: %s.", pom, error)
logger.info("Could not parse POM %s XML: %s.", pom_url, error)
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
"""Retrieve scm origin out of the page information. Only called when type of the

@ -127,7 +127,7 @@ def network_requests_mock(
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(MavenLister.page_request.retry, "sleep")
mocker.patch.object(MavenLister.http_request.retry, "sleep")
def test_maven_full_listing(swh_scheduler):

@ -1,4 +1,4 @@
# Copyright (C) 2018-2021 the Software Heritage developers
# Copyright (C) 2018-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -7,12 +7,8 @@ import logging
from typing import Any, Dict, Iterator, List, Optional
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -75,10 +71,7 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
self.page_size += 1
self.incremental = incremental
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
def state_from_dict(self, d: Dict[str, Any]) -> NpmListerState:
return NpmListerState(**d)
@ -95,21 +88,6 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
params["startkey"] = last_package_id
return params
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, last_package_id: str) -> requests.Response:
params = self.request_params(last_package_id)
logger.debug("Fetching URL %s with params %s", self.url, params)
response = self.session.get(self.url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
last_package_id: str = "0" if self.incremental else '""'
if (
@ -121,7 +99,9 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
while True:
response = self.page_request(last_package_id)
response = self.http_request(
self.url, params=self.request_params(last_package_id)
)
data = response.json()
page = data["results"] if self.incremental else data["rows"]

@ -1,4 +1,4 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -37,7 +37,7 @@ def npm_incremental_listing_page2(datadir):
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(NpmLister.page_request.retry, "sleep")
mocker.patch.object(NpmLister.http_request.retry, "sleep")
def _check_listed_npm_packages(lister, packages, scheduler_origins):
@ -78,19 +78,21 @@ def test_npm_lister_full(
additional_matcher=_match_request,
)
spy_get = mocker.spy(lister.session, "get")
spy_request = mocker.spy(lister.session, "request")
stats = lister.run()
assert stats.pages == 2
assert stats.origins == page_size * stats.pages
spy_get.assert_has_calls(
spy_request.assert_has_calls(
[
mocker.call(
"GET",
lister.API_FULL_LISTING_URL,
params=_url_params(page_size + 1, startkey='""'),
),
mocker.call(
"GET",
lister.API_FULL_LISTING_URL,
params=_url_params(
page_size + 1,
@ -132,7 +134,7 @@ def test_npm_lister_incremental(
additional_matcher=_match_request,
)
spy_get = mocker.spy(lister.session, "get")
spy_request = mocker.spy(lister.session, "request")
assert lister.get_state_from_scheduler() == NpmListerState()
@ -142,13 +144,15 @@ def test_npm_lister_incremental(
last_seq = npm_incremental_listing_page2["results"][-1]["seq"]
spy_get.assert_has_calls(
spy_request.assert_has_calls(
[
mocker.call(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since="0"),
),
mocker.call(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(
page_size,
@ -156,6 +160,7 @@ def test_npm_lister_incremental(
),
),
mocker.call(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since=str(last_seq)),
),
@ -189,11 +194,12 @@ def test_npm_lister_incremental_restart(
requests_mock.get(lister.API_INCREMENTAL_LISTING_URL, json={"results": []})
spy_get = mocker.spy(lister.session, "get")
spy_request = mocker.spy(lister.session, "request")
lister.run()
spy_get.assert_called_with(
spy_request.assert_called_with(
"GET",
lister.API_INCREMENTAL_LISTING_URL,
params=_url_params(page_size, since=str(last_seq)),
)

@ -1,4 +1,4 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -14,7 +14,6 @@ import requests
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -62,10 +61,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
credentials=credentials,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
@ -82,20 +78,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
return d
def api_request(self, url: str) -> Any:
logger.debug("Fetching URL %s", url)
response = self.session.get(url)
if response.status_code not in (200, 304):
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
response = self.http_request(url)
# response is empty when status code is 304
return response.json() if response.status_code == 200 else {}
@ -134,7 +117,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
# missing package metadata in response
continue
versions_info = metadata["packages"][package_name].values()
except requests.exceptions.HTTPError:
except requests.HTTPError:
# error when getting package metadata (usually 404 when a
# package has been removed), skip it and process next package
continue
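
This still works because http_request accepts 304 Not Modified as a success (its status check, visible in the swh.lister.pattern hunk below, allows both 200 and 304). A sketch of a conditional fetch, with a hypothetical date value:

    def fetch_if_modified(lister, url: str, since: str) -> dict:
        # since is an HTTP date string, e.g. "Wed, 21 Sep 2022 00:00:00 GMT"
        response = lister.http_request(url, headers={"If-Modified-Since": since})
        # A 304 response carries no body, so return an empty dict, as
        # PackagistLister.api_request does above.
        return response.json() if response.status_code == 200 else {}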

@ -1,4 +1,4 @@
# Copyright (C) 2020-2021 The Software Heritage developers
# Copyright (C) 2020-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -6,14 +6,23 @@
from __future__ import annotations
from dataclasses import dataclass
import logging
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
from urllib.parse import urlparse
import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.config import load_from_envvar
from swh.core.utils import grouper
from swh.scheduler import get_scheduler, model
from swh.scheduler.interface import SchedulerInterface
from . import USER_AGENT
from .utils import http_retry
logger = logging.getLogger(__name__)
@dataclass
class ListerStats:
@ -113,6 +122,27 @@ class Lister(Generic[StateType, PageType]):
self.state = self.get_state_from_scheduler()
self.updated = False
self.session = requests.Session()
# Declaring the USER_AGENT is more sysadmin-friendly for the forges we list
self.session.headers.update({"User-Agent": USER_AGENT})
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
response = self.session.request(method, url, **kwargs)
if response.status_code not in (200, 304):
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def run(self) -> ListerStats:
"""Run the lister.

@ -1,15 +1,13 @@
# Copyright (C) 2019-2021 the Software Heritage developers
# Copyright (C) 2019-2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
import logging
import random
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import requests
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -47,10 +45,7 @@ class PhabricatorLister(StatelessLister[PageType]):
scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
if api_token is not None:
self.api_token = api_token
@ -91,22 +86,7 @@ class PhabricatorLister(StatelessLister[PageType]):
after: Optional[str] = None
while True:
params = self.get_request_params(after)
logger.debug(
"Retrieving results on URI %s with parameters %s",
self.url,
self.filter_params(params),
)
response = self.session.post(self.url, data=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
response = self.http_request(self.url, method="POST", data=params)
response_data = response.json()

@ -1,4 +1,4 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -27,6 +27,11 @@ def phabricator_repositories_page2(datadir):
)
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(PhabricatorLister.http_request.retry, "sleep")
def test_get_repo_url(phabricator_repositories_page1):
repos = phabricator_repositories_page1["result"]["data"]
for repo in repos:

@ -2,15 +2,13 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
from typing import Iterator, List, Optional
import iso8601
import requests
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -52,7 +50,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
instance=self.INSTANCE,
url=self.BASE_URL,
)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
@ -60,23 +58,6 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
def get_pages(self) -> Iterator[PubDevListerPage]:
"""Yield an iterator which returns 'page'
@ -88,8 +69,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
There is only one page that list all origins url based on "{base_url}packages/{pkgname}"
"""
response = self.page_request(
url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
response = self.http_request(
url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)
)
yield response.json()["packages"]
@ -102,7 +83,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
base_url=self.url, pkgname=pkgname
)
try:
response = self.page_request(url=package_info_url, params={})
response = self.http_request(url=package_info_url)
except HTTPError:
logger.warning(
"Failed to fetch metadata for package %s, skipping it from listing.",

@ -15,14 +15,11 @@ from bs4 import BeautifulSoup
import iso8601
import lxml
import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.api.classes import stream_results
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -127,11 +124,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
# Will hold the currently saved "last modified" dates to compare against our
# requests.
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
self.session = requests.Session()
# Declare the USER_AGENT is more sysadm-friendly for the forge we list
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.session.headers.update({"Accept": "application/json"})
self.incremental = incremental
def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
@ -208,26 +202,6 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
self._project_last_modified = listed_origins
return listed_origins
@http_retry(
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def page_request(self, url, params) -> requests.Response:
# Log listed URL to ease debugging
logger.debug("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
# Log response content to ease debugging
logger.warning(
"Unexpected HTTP status code %s for URL %s",
response.status_code,
response.url,
)
# The lister must fail on blocking errors
response.raise_for_status()
return response
def get_pages(self) -> Iterator[SourceForgeListerPage]:
"""
SourceForge has a main XML sitemap that lists its sharded sitemaps for all
@ -240,7 +214,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
Lastly we use the information of which VCS are used to build the predictable
clone URL for any given VCS.
"""
sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text
sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text
tree = ElementTree.fromstring(sitemap_contents)
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
@ -259,7 +233,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
continue
self.state.subsitemap_last_modified[sub_url] = last_modified
subsitemap_contents = self.page_request(sub_url, {}).text
subsitemap_contents = self.http_request(sub_url).text
subtree = ElementTree.fromstring(subsitemap_contents)
yield from self._get_pages_from_subsitemap(subtree)
@ -351,9 +325,9 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
logger.debug(msg, namespace, project)
try:
res = self.page_request(endpoint, {}).json()
res = self.http_request(endpoint).json()
except requests.HTTPError:
# We've already logged in `page_request`
# We've already logged in `http_request`
return []
tools = res.get("tools")
@ -373,7 +347,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
# and multiple origin URLs can be produced for a same project.
cvs_info_url = f"http://{project}.cvs.sourceforge.net"
try:
response = self.page_request(cvs_info_url, params={})
response = self.http_request(cvs_info_url)
except requests.HTTPError:
logger.warning(
"CVS info page could not be fetched, skipping project '%s'",
@ -413,7 +387,7 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
# and a lot of them are 404 now.
url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"
try:
response = self.page_request(url, params={})
response = self.http_request(url)
if "To get this branch, use:" not in response.text:
# If a bzr project has multiple branches, we need to extract their
# names from the repository landing page and create one listed origin

@ -1,7 +1,8 @@
# Copyright (C) 2021 The Software Heritage developers
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import functools
import json
@ -368,7 +369,7 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
lister = SourceForgeLister(scheduler=swh_scheduler)
# Exponential retries take a long time, so stub time.sleep
mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
requests_mock.get(
MAIN_SITEMAP_URL,
@ -438,7 +439,7 @@ def test_sourceforge_lister_http_error(
lister = SourceForgeLister(scheduler=swh_scheduler)
# Exponential retries take a long time, so stub time.sleep
mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)
@ -458,7 +459,7 @@ def test_sourceforge_lister_project_error(
):
lister = SourceForgeLister(scheduler=swh_scheduler)
# Exponential retries take a long time, so stub time.sleep
mocker.patch.object(lister.page_request.retry, "sleep")
mocker.patch.object(lister.http_request.retry, "sleep")
requests_mock.get(
MAIN_SITEMAP_URL,

@ -1,4 +1,4 @@
# Copyright (C) 2021 The Software Heritage developers
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -8,14 +8,10 @@ from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import http_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@ -57,30 +53,7 @@ class TuleapLister(StatelessLister[RepoPage]):
instance=instance,
)
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
logger.info("Fetching URL %s with params %s", url, params)
response = self.session.get(url, params=params)
if response.status_code != 200:
logger.warning(
"Unexpected HTTP status code %s on %s: %s",
response.status_code,
response.url,
response.content,
)
response.raise_for_status()
return response
self.session.headers.update({"Accept": "application/json"})
@classmethod
def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage:
@ -97,14 +70,14 @@ class TuleapLister(StatelessLister[RepoPage]):
return rep
def _get_repositories(self, url_repo) -> List[Dict[str, Any]]:
ret = self.page_request(url_repo, {})
ret = self.http_request(url_repo)
reps_list = ret.json()["repositories"]
limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"])
offset = int(ret.headers["X-PAGINATION-LIMIT"])
size = int(ret.headers["X-PAGINATION-SIZE"])
while offset < size:
url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit)
ret = self.page_request(url_offset, {}).json()
ret = self.http_request(url_offset).json()
reps_list = reps_list + ret["repositories"]
offset += limit
return reps_list
@ -115,7 +88,7 @@ class TuleapLister(StatelessLister[RepoPage]):
url_projects = url_api + "/projects/"
# Get the list of projects.
response = self.page_request(url_projects, {})
response = self.http_request(url_projects)
projects_list = response.json()
limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
offset = int(response.headers["X-PAGINATION-LIMIT"])
@ -124,7 +97,7 @@ class TuleapLister(StatelessLister[RepoPage]):
url_offset = (
url_projects + "?offset=" + str(offset) + "&limit=" + str(limit)
)
ret = self.page_request(url_offset, {}).json()
ret = self.http_request(url_offset).json()
projects_list = projects_list + ret
offset += limit

@ -1,4 +1,4 @@
# Copyright (C) 2021 The Software Heritage developers
# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -94,7 +94,7 @@ def tuleap_repo_3(datadir) -> Tuple[str, Dict[str, str], List[RepoPage], List[st
@pytest.fixture(autouse=True)
def retry_sleep_mock(mocker):
mocker.patch.object(TuleapLister.page_request.retry, "sleep")
mocker.patch.object(TuleapLister.http_request.retry, "sleep")
def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):