diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py --- a/swh/lister/__init__.py +++ b/swh/lister/__init__.py @@ -15,9 +15,10 @@ except pkg_resources.DistributionNotFound: __version__ = "devel" -USER_AGENT_TEMPLATE = "Software Heritage Lister (%s)" -USER_AGENT = USER_AGENT_TEMPLATE % __version__ - +USER_AGENT_TEMPLATE = ( + f"Software Heritage %s lister v{__version__}" + " (+https://www.softwareheritage.org/contact)" +) LISTERS = { entry_point.name.split(".", 1)[1]: entry_point diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -69,7 +69,7 @@ for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] - assert "Software Heritage Lister" in user_agent + assert "Software Heritage cgit lister" in user_agent assert __version__ in user_agent diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -15,7 +15,6 @@ from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -87,7 +86,8 @@ self.relisting = self.first_id is not None or self.last_id is not None self.github_session = GitHubSession( - credentials=self.credentials, user_agent=USER_AGENT + credentials=self.credentials, + user_agent=str(self.session.headers["User-Agent"]), ) def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState: diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,8 +1,9 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import functools import json import logging from pathlib import Path @@ -11,7 +12,7 @@ import pytest from requests.status_codes import codes -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.gitlab.lister import GitLabLister, _parse_id_after from swh.lister.pattern import ListerStats from swh.lister.tests.test_utils import assert_sleep_calls @@ -24,8 +25,8 @@ return f"https://{instance}/api/v4/" -def _match_request(request): - return request.headers.get("User-Agent") == USER_AGENT +def _match_request(request, lister_name="gitlab"): + return request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % lister_name def test_lister_gitlab(datadir, swh_scheduler, requests_mock): @@ -70,7 +71,7 @@ requests_mock.get( lister.page_url(), [{"json": response}], - additional_matcher=_match_request, + additional_matcher=functools.partial(_match_request, lister_name="heptapod"), ) listed_result = lister.run() diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -18,7 +18,6 @@ from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -95,7 +94,8 @@ self.jar_origins: Dict[str, ListedOrigin] = {} self.github_session = GitHubSession( - credentials=self.credentials, user_agent=USER_AGENT + credentials=self.credentials, + user_agent=str(self.session.headers["User-Agent"]), ) def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState: diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py --- a/swh/lister/npm/tests/test_lister.py +++ b/swh/lister/npm/tests/test_lister.py @@ -11,7 +11,7 @@ import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.npm.lister import NpmLister, NpmListerState @@ -53,7 +53,9 @@ def _match_request(request): - return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") == USER_AGENT_TEMPLATE % NpmLister.LISTER_NAME + ) def _url_params(page_size, **kwargs): diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -18,7 +18,7 @@ from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface -from . import USER_AGENT +from . import USER_AGENT_TEMPLATE from .utils import http_retry logger = logging.getLogger(__name__) @@ -124,7 +124,9 @@ self.session = requests.Session() # Declare the USER_AGENT is more sysadm-friendly for the forge we list - self.session.headers.update({"User-Agent": USER_AGENT}) + self.session.headers.update( + {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME} + ) @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def http_request(self, url: str, method="GET", **kwargs) -> requests.Response: diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py --- a/swh/lister/phabricator/tests/test_lister.py +++ b/swh/lister/phabricator/tests/test_lister.py @@ -9,7 +9,7 @@ import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.phabricator.lister import PhabricatorLister, get_repo_url @@ -94,7 +94,8 @@ def match_request(request): return ( - request.headers.get("User-Agent") == USER_AGENT + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % PhabricatorLister.LISTER_NAME and f"api.token={API_TOKEN}" in request.body ) diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -12,15 +12,8 @@ from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. import __version__ from ..pattern import CredentialsType, StatelessLister -# https://github.com/dart-lang/pub/blob/master/doc/repository-spec-v2.md#metadata-headers -USER_AGENT = ( - f"Software Heritage PubDev Lister v{__version__} " - "(+https://www.softwareheritage.org/contact)" -) - logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. @@ -51,12 +44,7 @@ url=self.BASE_URL, ) - self.session.headers.update( - { - "Accept": "application/json", - "User-Agent": USER_AGENT, - } - ) + self.session.headers.update({"Accept": "application/json"}) def get_pages(self) -> Iterator[PubDevListerPage]: """Yield an iterator which returns 'page' diff --git a/swh/lister/pubdev/tests/test_lister.py b/swh/lister/pubdev/tests/test_lister.py --- a/swh/lister/pubdev/tests/test_lister.py +++ b/swh/lister/pubdev/tests/test_lister.py @@ -3,7 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.lister.pubdev.lister import USER_AGENT, PubDevLister +from swh.lister import USER_AGENT_TEMPLATE +from swh.lister.pubdev.lister import PubDevLister expected_origins = { "https://pub.dev/packages/Autolinker", @@ -29,7 +30,10 @@ def _match_request(request): - return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % PubDevLister.LISTER_NAME + ) def test_pubdev_lister_skip_package( diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -13,7 +13,7 @@ import pytest from requests.exceptions import HTTPError -from swh.lister import USER_AGENT +from swh.lister import USER_AGENT_TEMPLATE from swh.lister.sourceforge.lister import ( MAIN_SITEMAP_URL, PROJECT_API_URL_FORMAT, @@ -75,7 +75,10 @@ def _check_request_headers(request): - return request.headers.get("User-Agent") == USER_AGENT + return ( + request.headers.get("User-Agent") + == USER_AGENT_TEMPLATE % SourceForgeLister.LISTER_NAME + ) def _check_listed_origins(lister, swh_scheduler):