From d5c30a3ce371efa73cbabece72ce214bf18e3d3e Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Thu, 22 Sep 2022 15:43:20 +0200 Subject: [PATCH] Update value of User-Agent HTTP request header used by listers That HTTP header value will now contain the lister name but also a link to our contact form in order for sysadmins to easily reach us if needed. The following template is used to generate it: "Software Heritage {lister_name} lister v{version} (+https://www.softwareheritage.org/contact)" --- swh/lister/__init__.py | 7 ++++--- swh/lister/cgit/tests/test_lister.py | 2 +- swh/lister/github/lister.py | 4 ++-- swh/lister/gitlab/tests/test_lister.py | 11 ++++++----- swh/lister/maven/lister.py | 4 ++-- swh/lister/npm/tests/test_lister.py | 6 ++++-- swh/lister/pattern.py | 6 ++++-- swh/lister/phabricator/tests/test_lister.py | 5 +++-- swh/lister/pubdev/lister.py | 14 +------------- swh/lister/pubdev/tests/test_lister.py | 8 ++++++-- swh/lister/sourceforge/tests/test_lister.py | 7 +++++-- 11 files changed, 38 insertions(+), 36 deletions(-) diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py index f4448d8..be53d8b 100644 --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -15,9 +15,10 @@ try: except pkg_resources.DistributionNotFound: __version__ = "devel" -USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)" -USER_AGENT = USER_AGENT_TEMPLATE % __version__ - +USER_AGENT_TEMPLATE = ( + f"Software Heritage %s lister v{__version__}" + " (+https://www.softwareheritage.org/contact)" +) LISTERS = { entry_point.name.split(".", 1)[1]: entry_point diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index 9d0f123..c6ffcf2 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -69,7 +69,7 @@ def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler): for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] - assert 
"Software Heritage Lister" in user_agent + assert "Software Heritage cgit lister" in user_agent assert __version__ in user_agent diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index acef224..ae10d71 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -15,7 +15,6 @@ from swh.core.github.utils import GitHubSession, MissingRateLimitReset from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -87,7 +86,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): self.relisting = self.first_id is not None or self.last_id is not None self.github_session = GitHubSession( - credentials=self.credentials, user_agent=USER_AGENT + credentials=self.credentials, + user_agent=str(self.session.headers["User-Agent"]), ) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py index 80650b8..6bbffcd 100644 --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,8 +1,9 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import functools import json import logging from pathlib import Path @@ -11,7 +12,7 @@ from typing import Dict, List import pytest from requests.status_codes import codes -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.gitlab.lister import GitLabLister, _parse_id_after from swh.lister.pattern import ListerStats from swh.lister.tests.test_utils import assert_sleep_calls @@ -24,8 +25,8 @@ def 
api_url(instance: str) -> str: return f"https://{instance}/api/v4/" -def _match_request(request): - return request.headers.get("User-Agent") == USER_AGENT +def _match_request(request, lister_name="gitlab"): + return request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % lister_name def test_lister_gitlab(datadir, swh_scheduler, requests_mock): @@ -70,7 +71,7 @@ def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock): requests_mock.get( lister.page_url(), [{"json": response}], - additional_matcher=_match_request, + additional_matcher=functools.partial(_match_request, lister_name="heptapod"), ) listed_result = lister.run() diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index b230552..2055b91 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -18,7 +18,6 @@ from swh.core.github.utils import GitHubSession from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -95,7 +94,8 @@ class MavenLister(Lister[MavenListerState, RepoPage]): self.jar_origins: Dict[str, ListedOrigin] = {} self.github_session = GitHubSession( - credentials=self.credentials, user_agent=USER_AGENT + credentials=self.credentials, + user_agent=str(self.session.headers["User-Agent"]), ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py index e8f8fa8..7c4fa93 100644 --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -11,7 +11,7 @@ import iso8601 import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.npm.lister import NpmLister, NpmListerState @@ -53,7 +53,9 @@ def _check_listed_npm_packages(lister, packages, scheduler_origins): def _match_request(request): 
- return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % NpmLister.LISTER_NAME + ) def _url_params(page_size, **kwargs): diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 5b327e1..d188896 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -18,7 +18,7 @@ from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface -from . import USER_AGENT +from . import USER_AGENT_TEMPLATE from .utils import http_retry logger = logging.getLogger(__name__) @@ -124,7 +124,9 @@ class Lister(Generic[StateType, PageType]): self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list - self.session.headers.update({"User-Agent": USER_AGENT}) + self.session.headers.update( + {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME} + ) @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def http_request(self, url: str, method="GET", **kwargs) -> requests.Response: diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py index ed35435..c6e7043 100644 --- a/swh/lister/phabricator/tests/test_lister.py +++ b/swh/lister/phabricator/tests/test_lister.py @@ -9,7 +9,7 @@ from pathlib import Path import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url @@ -94,7 +94,8 @@ def test_lister( def match_request(request): return ( - request.headers.get("User-Agent") == USER_AGENT + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % PhabricatorLister.LISTER_NAME and f"api.token={API_TOKEN}" in request.body ) diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index 8910f39..fd1dc45 100644 --- a/swh/lister/pubdev/lister.py +++ 
b/swh/lister/pubdev/lister.py @@ -12,15 +12,8 @@ from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import __version__ from ..pattern import CredentialsType, StatelessLister -# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers -USER_AGENT = ( - f"Software Heritage PubDev Lister v{__version__} " - "(+https://www.softwareheritage.org/contact)" -) - logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. @@ -51,12 +44,7 @@ class PubDevLister(StatelessLister[PubDevListerPage]): url=self.BASE_URL, ) - self.session.headers.update( - { - "Accept": "application/json", - "User-Agent": USER_AGENT, - } - ) + self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py index ac2be14..5113249 100644 --- a/swh/lister/pubdev/tests/test_lister.py +++ b/swh/lister/pubdev/tests/test_lister.py @@ -3,7 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.lister.pubdev.lister import USER_AGENT, PubDevLister +from swh.lister import USER_AGENT_TEMPLATE +from swh.lister.pubdev.lister import PubDevLister expected_origins = { "https://pub.dev/packages/Autolinker", @@ -29,7 +30,10 @@ def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler): def _match_request(request): - return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % PubDevLister.LISTER_NAME + ) def test_pubdev_lister_skip_package( diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py index d6aabc3..1a97bf3 100644 --- 
a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -13,7 +13,7 @@ from iso8601 import iso8601 import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.sourceforge.lister import ( MAIN_SITEMAP_URL, PROJECT_API_URL_FORMAT, @@ -75,7 +75,10 @@ def get_bzr_repo_page(datadir, repo_name): def _check_request_headers(request): - return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % SourceForgeLister.LISTER_NAME + ) def _check_listed_origins(lister, swh_scheduler):