Update value of User-Agent HTTP request header used by listers
The header value now contains the lister name as well as a link to our contact form, so that sysadmins can easily reach us if needed. It is generated from the following template: "Software Heritage <lister_name> lister v<swh-lister version> (+https://www.softwareheritage.org/contact)"
This commit is contained in:
parent
db6ce12e9e
commit
d5c30a3ce3
11 changed files with 38 additions and 36 deletions
|
@ -15,9 +15,10 @@ try:
|
|||
except pkg_resources.DistributionNotFound:
|
||||
__version__ = "devel"
|
||||
|
||||
USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)"
|
||||
USER_AGENT = USER_AGENT_TEMPLATE % __version__
|
||||
|
||||
USER_AGENT_TEMPLATE = (
|
||||
f"Software Heritage %s lister v{__version__}"
|
||||
" (+https://www.softwareheritage.org/contact)"
|
||||
)
|
||||
|
||||
LISTERS = {
|
||||
entry_point.name.split(".", 1)[1]: entry_point
|
||||
|
|
|
@ -69,7 +69,7 @@ def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
|
|||
for request in requests_mock_datadir.request_history:
|
||||
assert "User-Agent" in request.headers
|
||||
user_agent = request.headers["User-Agent"]
|
||||
assert "Software Heritage Lister" in user_agent
|
||||
assert "Software Heritage cgit lister" in user_agent
|
||||
assert __version__ in user_agent
|
||||
|
||||
|
||||
|
|
|
@ -15,7 +15,6 @@ from swh.core.github.utils import GitHubSession, MissingRateLimitReset
|
|||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import CredentialsType, Lister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -87,7 +86,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
|
|||
self.relisting = self.first_id is not None or self.last_id is not None
|
||||
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials, user_agent=USER_AGENT
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# Copyright (C) 2017-2021 The Software Heritage developers
|
||||
# Copyright (C) 2017-2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import functools
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
@ -11,7 +12,7 @@ from typing import Dict, List
|
|||
import pytest
|
||||
from requests.status_codes import codes
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister import USER_AGENT_TEMPLATE
|
||||
from swh.lister.gitlab.lister import GitLabLister, _parse_id_after
|
||||
from swh.lister.pattern import ListerStats
|
||||
from swh.lister.tests.test_utils import assert_sleep_calls
|
||||
|
@ -24,8 +25,8 @@ def api_url(instance: str) -> str:
|
|||
return f"https://{instance}/api/v4/"
|
||||
|
||||
|
||||
def _match_request(request):
|
||||
return request.headers.get("User-Agent") == USER_AGENT
|
||||
def _match_request(request, lister_name="gitlab"):
|
||||
return request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % lister_name
|
||||
|
||||
|
||||
def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
|
||||
|
@ -70,7 +71,7 @@ def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock):
|
|||
requests_mock.get(
|
||||
lister.page_url(),
|
||||
[{"json": response}],
|
||||
additional_matcher=_match_request,
|
||||
additional_matcher=functools.partial(_match_request, lister_name="heptapod"),
|
||||
)
|
||||
|
||||
listed_result = lister.run()
|
||||
|
|
|
@ -18,7 +18,6 @@ from swh.core.github.utils import GitHubSession
|
|||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import CredentialsType, Lister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -95,7 +94,8 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
|
||||
self.jar_origins: Dict[str, ListedOrigin] = {}
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials, user_agent=USER_AGENT
|
||||
credentials=self.credentials,
|
||||
user_agent=str(self.session.headers["User-Agent"]),
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
|
||||
|
|
|
@ -11,7 +11,7 @@ import iso8601
|
|||
import pytest
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister import USER_AGENT_TEMPLATE
|
||||
from swh.lister.npm.lister import NpmLister, NpmListerState
|
||||
|
||||
|
||||
|
@ -53,7 +53,9 @@ def _check_listed_npm_packages(lister, packages, scheduler_origins):
|
|||
|
||||
|
||||
def _match_request(request):
|
||||
return request.headers.get("User-Agent") == USER_AGENT
|
||||
return (
|
||||
request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % NpmLister.LISTER_NAME
|
||||
)
|
||||
|
||||
|
||||
def _url_params(page_size, **kwargs):
|
||||
|
|
|
@ -18,7 +18,7 @@ from swh.core.utils import grouper
|
|||
from swh.scheduler import get_scheduler, model
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
||||
from . import USER_AGENT
|
||||
from . import USER_AGENT_TEMPLATE
|
||||
from .utils import http_retry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -124,7 +124,9 @@ class Lister(Generic[StateType, PageType]):
|
|||
|
||||
self.session = requests.Session()
|
||||
# Declaring an explicit User-Agent is more sysadmin-friendly for the forges we list
|
||||
self.session.headers.update({"User-Agent": USER_AGENT})
|
||||
self.session.headers.update(
|
||||
{"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
|
||||
)
|
||||
|
||||
@http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
|
||||
def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
|
||||
|
|
|
@ -9,7 +9,7 @@ from pathlib import Path
|
|||
import pytest
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister import USER_AGENT_TEMPLATE
|
||||
from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url
|
||||
|
||||
|
||||
|
@ -94,7 +94,8 @@ def test_lister(
|
|||
|
||||
def match_request(request):
|
||||
return (
|
||||
request.headers.get("User-Agent") == USER_AGENT
|
||||
request.headers.get("User-Agent")
|
||||
== USER_AGENT_TEMPLATE % PhabricatorLister.LISTER_NAME
|
||||
and f"api.token={API_TOKEN}" in request.body
|
||||
)
|
||||
|
||||
|
|
|
@ -12,15 +12,8 @@ from requests.exceptions import HTTPError
|
|||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import __version__
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers
|
||||
USER_AGENT = (
|
||||
f"Software Heritage PubDev Lister v{__version__} "
|
||||
"(+https://www.softwareheritage.org/contact)"
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Aliasing the page results returned by `get_pages` method from the lister.
|
||||
|
@ -51,12 +44,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
|
|||
url=self.BASE_URL,
|
||||
)
|
||||
|
||||
self.session.headers.update(
|
||||
{
|
||||
"Accept": "application/json",
|
||||
"User-Agent": USER_AGENT,
|
||||
}
|
||||
)
|
||||
self.session.headers.update({"Accept": "application/json"})
|
||||
|
||||
def get_pages(self) -> Iterator[PubDevListerPage]:
|
||||
"""Yield an iterator which returns 'page'
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pubdev.lister import USER_AGENT, PubDevLister
|
||||
from swh.lister import USER_AGENT_TEMPLATE
|
||||
from swh.lister.pubdev.lister import PubDevLister
|
||||
|
||||
expected_origins = {
|
||||
"https://pub.dev/packages/Autolinker",
|
||||
|
@ -29,7 +30,10 @@ def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
|
|||
|
||||
|
||||
def _match_request(request):
|
||||
return request.headers.get("User-Agent") == USER_AGENT
|
||||
return (
|
||||
request.headers.get("User-Agent")
|
||||
== USER_AGENT_TEMPLATE % PubDevLister.LISTER_NAME
|
||||
)
|
||||
|
||||
|
||||
def test_pubdev_lister_skip_package(
|
||||
|
|
|
@ -13,7 +13,7 @@ from iso8601 import iso8601
|
|||
import pytest
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister import USER_AGENT_TEMPLATE
|
||||
from swh.lister.sourceforge.lister import (
|
||||
MAIN_SITEMAP_URL,
|
||||
PROJECT_API_URL_FORMAT,
|
||||
|
@ -75,7 +75,10 @@ def get_bzr_repo_page(datadir, repo_name):
|
|||
|
||||
|
||||
def _check_request_headers(request):
|
||||
return request.headers.get("User-Agent") == USER_AGENT
|
||||
return (
|
||||
request.headers.get("User-Agent")
|
||||
== USER_AGENT_TEMPLATE % SourceForgeLister.LISTER_NAME
|
||||
)
|
||||
|
||||
|
||||
def _check_listed_origins(lister, swh_scheduler):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue