Update value of User-Agent HTTP request header used by listers

That HTTP header value now contains the lister name as well as a link
to our contact form, so that sysadmins can easily reach us if needed.

The following template is used to generate it:

"Software Heritage <lister_name> lister v<swh-lister version>
 (+https://www.softwareheritage.org/contact)"
This commit is contained in:
Antoine Lambert 2022-09-22 15:43:20 +02:00
parent db6ce12e9e
commit d5c30a3ce3
11 changed files with 38 additions and 36 deletions

View file

@ -15,9 +15,10 @@ try:
except pkg_resources.DistributionNotFound:
__version__ = "devel"
USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)"
USER_AGENT = USER_AGENT_TEMPLATE % __version__
USER_AGENT_TEMPLATE = (
f"Software Heritage %s lister v{__version__}"
" (+https://www.softwareheritage.org/contact)"
)
LISTERS = {
entry_point.name.split(".", 1)[1]: entry_point

View file

@ -69,7 +69,7 @@ def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
for request in requests_mock_datadir.request_history:
assert "User-Agent" in request.headers
user_agent = request.headers["User-Agent"]
assert "Software Heritage Lister" in user_agent
assert "Software Heritage cgit lister" in user_agent
assert __version__ in user_agent

View file

@ -15,7 +15,6 @@ from swh.core.github.utils import GitHubSession, MissingRateLimitReset
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -87,7 +86,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
self.relisting = self.first_id is not None or self.last_id is not None
self.github_session = GitHubSession(
credentials=self.credentials, user_agent=USER_AGENT
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:

View file

@ -1,8 +1,9 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import json
import logging
from pathlib import Path
@ -11,7 +12,7 @@ from typing import Dict, List
import pytest
from requests.status_codes import codes
from swh.lister import USER_AGENT
from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.gitlab.lister import GitLabLister, _parse_id_after
from swh.lister.pattern import ListerStats
from swh.lister.tests.test_utils import assert_sleep_calls
@ -24,8 +25,8 @@ def api_url(instance: str) -> str:
return f"https://{instance}/api/v4/"
def _match_request(request):
return request.headers.get("User-Agent") == USER_AGENT
def _match_request(request, lister_name="gitlab"):
return request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % lister_name
def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
@ -70,7 +71,7 @@ def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock):
requests_mock.get(
lister.page_url(),
[{"json": response}],
additional_matcher=_match_request,
additional_matcher=functools.partial(_match_request, lister_name="heptapod"),
)
listed_result = lister.run()

View file

@ -18,7 +18,6 @@ from swh.core.github.utils import GitHubSession
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
@ -95,7 +94,8 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
self.jar_origins: Dict[str, ListedOrigin] = {}
self.github_session = GitHubSession(
credentials=self.credentials, user_agent=USER_AGENT
credentials=self.credentials,
user_agent=str(self.session.headers["User-Agent"]),
)
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:

View file

@ -11,7 +11,7 @@ import iso8601
import pytest
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.npm.lister import NpmLister, NpmListerState
@ -53,7 +53,9 @@ def _check_listed_npm_packages(lister, packages, scheduler_origins):
def _match_request(request):
return request.headers.get("User-Agent") == USER_AGENT
return (
request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % NpmLister.LISTER_NAME
)
def _url_params(page_size, **kwargs):

View file

@ -18,7 +18,7 @@ from swh.core.utils import grouper
from swh.scheduler import get_scheduler, model
from swh.scheduler.interface import SchedulerInterface
from . import USER_AGENT
from . import USER_AGENT_TEMPLATE
from .utils import http_retry
logger = logging.getLogger(__name__)
@ -124,7 +124,9 @@ class Lister(Generic[StateType, PageType]):
self.session = requests.Session()
# Declaring a User-Agent is more sysadmin-friendly for the forges we list
self.session.headers.update({"User-Agent": USER_AGENT})
self.session.headers.update(
{"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
)
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:

View file

@ -9,7 +9,7 @@ from pathlib import Path
import pytest
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url
@ -94,7 +94,8 @@ def test_lister(
def match_request(request):
return (
request.headers.get("User-Agent") == USER_AGENT
request.headers.get("User-Agent")
== USER_AGENT_TEMPLATE % PhabricatorLister.LISTER_NAME
and f"api.token={API_TOKEN}" in request.body
)

View file

@ -12,15 +12,8 @@ from requests.exceptions import HTTPError
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import __version__
from ..pattern import CredentialsType, StatelessLister
# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers
USER_AGENT = (
f"Software Heritage PubDev Lister v{__version__} "
"(+https://www.softwareheritage.org/contact)"
)
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
@ -51,12 +44,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
url=self.BASE_URL,
)
self.session.headers.update(
{
"Accept": "application/json",
"User-Agent": USER_AGENT,
}
)
self.session.headers.update({"Accept": "application/json"})
def get_pages(self) -> Iterator[PubDevListerPage]:
"""Yield an iterator which returns 'page'

View file

@ -3,7 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pubdev.lister import USER_AGENT, PubDevLister
from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.pubdev.lister import PubDevLister
expected_origins = {
"https://pub.dev/packages/Autolinker",
@ -29,7 +30,10 @@ def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
def _match_request(request):
return request.headers.get("User-Agent") == USER_AGENT
return (
request.headers.get("User-Agent")
== USER_AGENT_TEMPLATE % PubDevLister.LISTER_NAME
)
def test_pubdev_lister_skip_package(

View file

@ -13,7 +13,7 @@ from iso8601 import iso8601
import pytest
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
from swh.lister import USER_AGENT_TEMPLATE
from swh.lister.sourceforge.lister import (
MAIN_SITEMAP_URL,
PROJECT_API_URL_FORMAT,
@ -75,7 +75,10 @@ def get_bzr_repo_page(datadir, repo_name):
def _check_request_headers(request):
return request.headers.get("User-Agent") == USER_AGENT
return (
request.headers.get("User-Agent")
== USER_AGENT_TEMPLATE % SourceForgeLister.LISTER_NAME
)
def _check_listed_origins(lister, swh_scheduler):