diff --git a/docs/new_lister_template.py b/docs/new_lister_template.py
--- a/docs/new_lister_template.py
+++ b/docs/new_lister_template.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,14 +8,9 @@
 from typing import Any, Dict, Iterator, List
 from urllib.parse import urljoin
 
-import requests
-from tenacity.before_sleep import before_sleep_log
-
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -65,11 +60,7 @@
             instance=instance,
         )
 
-        self.session = requests.Session()
-        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
     def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
         return NewForgeListerState(**d)
@@ -77,32 +68,6 @@
     def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
         return asdict(state)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url, params) -> requests.Response:
-        # Do the network resource request under a retrying decorator
-        # to handle rate limiting and transient errors up to a limit.
-        # `http_retry` by default use the `requests` library to check
-        # only for rate-limit and a base-10 exponential waiting strategy.
-        # This can be customized by passed waiting, retrying and logging strategies
-        # as functions. See the `tenacity` library documentation.
-
-        # Log listed URL to ease debugging
-        logger.debug("Fetching URL %s with params %s", url, params)
-        response = self.session.get(url, params=params)
-
-        if response.status_code != 200:
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            # The lister must fail on blocking errors
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[NewForgeListerPage]:
         # The algorithm depends on the service, but should request data reliably,
         # following pagination if relevant and yielding pages in a streaming fashion.
@@ -120,7 +85,7 @@
         while current is not None:
             # Parametrize the request for incremental listing
-            body = self.page_request(url, {"current": current}).json()
+            body = self.http_request(url, params={"current": current}).json()
 
             # Simplify the page if possible to only the necessary elements
             # and yield it
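With the template change above, a new lister no longer manages its own
`requests.Session`: the session and the retrying `http_request` helper are both
inherited from `pattern.Lister`. A minimal sketch of what a new lister's page
loop now looks like (the endpoint and JSON field names are hypothetical):

    from typing import Any, Dict, Iterator

    def get_pages(self) -> Iterator[Dict[str, Any]]:
        # self.http_request is inherited from pattern.Lister: it logs the URL,
        # retries transient failures and raises requests.HTTPError on a
        # blocking error, so no per-lister session setup is needed.
        url = "https://forge.example.org/api/repos"  # hypothetical endpoint
        page = 1
        while True:
            body = self.http_request(url, params={"page": page}).json()
            if not body["results"]:  # hypothetical field
                break
            yield body
            page += 1
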
diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py
--- a/swh/lister/__init__.py
+++ b/swh/lister/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,8 +7,6 @@
 
 import pkg_resources
 
-from swh.lister import pattern
-
 logger = logging.getLogger(__name__)
@@ -53,6 +51,9 @@
     registry_entry = LISTERS[lister_name].load()()
     lister_cls = registry_entry["lister"]
+
+    from swh.lister import pattern
+
     if issubclass(lister_cls, pattern.Lister):
         return lister_cls.from_config(**conf)
     else:
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -13,15 +13,11 @@
 from urllib.parse import unquote, urljoin
 
 from bs4 import BeautifulSoup
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.model.hashutil import hash_to_hex
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -125,29 +121,6 @@
         )
         self.flavours = flavours
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "User-Agent": USER_AGENT,
-            }
-        )
-
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.debug("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
 
     def scrap_package_versions(
         self, name: str, repo: str, base_url: str
@@ -179,7 +152,7 @@
         url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
             pkgname=name, base_url=base_url
         )
-        response = self.request_get(url=url, params={})
+        response = self.http_request(url)
         soup = BeautifulSoup(response.text, "html.parser")
         links = soup.find_all("a", href=True)
@@ -263,7 +236,7 @@
         Returns:
             a directory Path where the archive has been extracted to.
         """
-        res = self.request_get(url=url, params={})
+        res = self.http_request(url)
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(res.content)
diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py
--- a/swh/lister/aur/lister.py
+++ b/swh/lister/aur/lister.py
@@ -7,8 +7,6 @@
 import logging
 from typing import Any, Dict, Iterator, List, Optional
 
-import requests
-
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -65,7 +63,7 @@
             a directory Path where the archive has been downloaded to.
         """
         url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
-        return requests.get(url).json()
+        return self.http_request(url).json()
 
     def get_pages(self) -> Iterator[AurListerPage]:
         """Yield an iterator which returns 'page'
diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,14 +11,10 @@
 from urllib import parse
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -77,10 +73,7 @@
             ),
         }
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if len(self.credentials) > 0:
             cred = random.choice(self.credentials)
@@ -107,25 +100,6 @@
         if username is not None and password is not None:
             self.session.auth = (username, password)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
-    def page_request(self, last_repo_cdate: str) -> requests.Response:
-
-        self.url_params["after"] = last_repo_cdate
-        logger.debug("Fetching URL %s with params %s", self.url, self.url_params)
-
-        response = self.session.get(self.url, params=self.url_params)
-
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
 
         last_repo_cdate: str = "1970-01-01"
@@ -137,7 +111,8 @@
             last_repo_cdate = self.state.last_repo_cdate.isoformat()
 
         while True:
-            body = self.page_request(last_repo_cdate).json()
+            self.url_params["after"] = last_repo_cdate
+            body = self.http_request(self.url, params=self.url_params).json()
 
             yield body["values"]
diff --git a/swh/lister/bitbucket/tests/test_lister.py b/swh/lister/bitbucket/tests/test_lister.py
--- a/swh/lister/bitbucket/tests/test_lister.py
+++ b/swh/lister/bitbucket/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2021 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -69,14 +69,16 @@
     assert last_repo_cdate == bb_api_repositories_page2["values"][-1]["created_on"]
 
     # Second listing, restarting from last state
-    lister.session.get = mocker.spy(lister.session, "get")
+    lister.session.request = mocker.spy(lister.session, "request")
 
     lister.run()
 
     url_params = lister.url_params
     url_params["after"] = last_repo_cdate
 
-    lister.session.get.assert_called_once_with(lister.API_URL, params=url_params)
+    lister.session.request.assert_called_once_with(
+        "GET", lister.API_URL, params=url_params
+    )
 
     all_origins = (
         bb_api_repositories_page1["values"] + bb_api_repositories_page2["values"]
@@ -106,7 +108,7 @@
 
     lister = BitbucketLister(scheduler=swh_scheduler, page_size=10)
 
-    mocker.patch.object(lister.page_request.retry, "sleep")
+    mocker.patch.object(lister.http_request.retry, "sleep")
 
     stats = lister.run()
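The test updates above follow mechanically from the refactoring: `http_request`
funnels everything through `Session.request`, so spies now target
`session.request` and must expect the explicit `"GET"` argument, while the
tenacity sleep to stub lives on `http_request.retry`. A condensed sketch of the
pattern for some hypothetical `ExampleLister`:

    def test_example_lister_uses_shared_session(swh_scheduler, mocker, requests_mock):
        lister = ExampleLister(scheduler=swh_scheduler)  # hypothetical lister

        # Stub tenacity's sleep so exponential backoff cannot slow the test down
        mocker.patch.object(lister.http_request.retry, "sleep")

        # http_request goes through Session.request, so spy on that method
        spy_request = mocker.spy(lister.session, "request")

        requests_mock.get("https://forge.example.org/api/repos", json={"results": []})
        list(lister.get_pages())

        # The HTTP method is now an explicit positional argument
        spy_request.assert_called_once_with(
            "GET", "https://forge.example.org/api/repos", params={"page": 1}
        )
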
""" - response = self.page_request(url=self.url, params={}) + response = self.http_request(self.url) yield response.json() def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]: diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,13 +9,9 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup -import requests from requests.exceptions import HTTPError -from tenacity.before_sleep import before_sleep_log -from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister -from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -73,17 +69,12 @@ credentials=credentials, ) - self.session = requests.Session() - self.session.headers.update( - {"Accept": "application/html", "User-Agent": USER_AGENT} - ) + self.session.headers.update({"Accept": "application/html"}) self.base_git_url = base_git_url - @http_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" - response = self.session.get(url) - response.raise_for_status() + response = self.http_request(url) return BeautifulSoup(response.text, features="html.parser") def get_pages(self) -> Iterator[Repositories]: diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2022 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -258,7 +258,7 @@ lister_cgit = CGitLister(swh_scheduler, url=url) - mocker.patch.object(lister_cgit._get_and_parse.retry, "sleep") + mocker.patch.object(lister_cgit.http_request.retry, "sleep") repos: List[List[str]] = list(lister_cgit.get_pages()) flattened_repos = sum(repos, []) diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,9 +1,8 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - import bz2 from collections import defaultdict from dataclasses import dataclass, field @@ -17,12 +16,11 @@ from urllib.parse import urljoin from debian.deb822 import Sources -import requests +from requests.exceptions import HTTPError from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. 
import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -95,9 +93,6 @@ self.suites = suites self.components = components - self.session = requests.Session() - self.session.headers.update({"User-Agent": USER_AGENT}) - # will hold all listed origins info self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {} # will contain origin urls that have already been listed @@ -132,9 +127,11 @@ def page_request(self, suite: Suite, component: Component) -> DebianPageType: """Return parsed package Sources file for a given debian suite and component.""" for url, compression in self.debian_index_urls(suite, component): - response = requests.get(url, stream=True) - logging.debug("Fetched URL: %s, status code: %s", url, response.status_code) - if response.status_code == 200: + try: + response = self.http_request(url, stream=True) + except HTTPError: + pass + else: last_modified = response.headers.get("Last-Modified") self.last_sources_update = ( parsedate_to_datetime(last_modified) if last_modified else None diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,12 +10,10 @@ from urllib.parse import parse_qs, urlencode, urlparse import iso8601 -import requests from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log -from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister from swh.lister.utils import http_retry, is_retryable_exception from swh.scheduler.model import ListedOrigin @@ -118,10 +116,7 @@ self.last_page: Optional[str] = None self.per_page = 100 - self.session = requests.Session() - self.session.headers.update( - {"Accept": "application/json", "User-Agent": USER_AGENT} - ) + self.session.headers.update({"Accept": "application/json"}) if len(self.credentials) > 0: cred = random.choice(self.credentials) diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -2,6 +2,7 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + from dataclasses import asdict, dataclass import logging import random @@ -9,14 +10,11 @@ from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse import iso8601 -import requests -from tenacity.before_sleep import before_sleep_log +from requests.exceptions import HTTPError -from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from .. 
import USER_AGENT from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) @@ -96,13 +94,7 @@ # Raises an error on Gogs, or a warning on Gitea self.on_anonymous_mode() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "User-Agent": USER_AGENT, - } - ) + self.session.headers.update({"Accept": "application/json"}) if self.api_token: self.session.headers["Authorization"] = f"token {self.api_token}" @@ -116,34 +108,27 @@ def state_to_dict(self, state: GogsListerState) -> Dict[str, Any]: return asdict(state) - @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) def page_request( self, url: str, params: Dict[str, Any] ) -> Tuple[Dict[str, Any], Dict[str, Any]]: logger.debug("Fetching URL %s with params %s", url, params) - response = self.session.get(url, params=params) - - if response.status_code != 200: - logger.warning( - "Unexpected HTTP status code %s on %s: %s", - response.status_code, - response.url, - response.content, - ) - if ( - response.status_code == 500 - ): # Temporary hack for skipping fatal repos (T4423) - url_parts = urlparse(url) - query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) - query.update({"page": _parse_page_id(url) + 1}) - next_page_link = url_parts._replace(query=urlencode(query)).geturl() - body: Dict[str, Any] = {"data": []} - links = {"next": {"url": next_page_link}} - return body, links - else: - response.raise_for_status() + try: + response = self.http_request(url, params=params) + except HTTPError as http_error: + if ( + http_error.response.status_code == 500 + ): # Temporary hack for skipping fatal repos (T4423) + url_parts = urlparse(url) + query: Dict[str, Any] = dict(parse_qsl(url_parts.query)) + query.update({"page": _parse_page_id(url) + 1}) + next_page_link = url_parts._replace(query=urlencode(query)).geturl() + body: Dict[str, Any] = {"data": []} + links = {"next": {"url": next_page_link}} + return body, links + else: + raise return response.json(), response.links @@ -159,6 +144,7 @@ # base with trailing slash, path without leading slash for urljoin next_link: Optional[str] = urljoin(self.url, self.REPO_LIST_PATH) + assert next_link is not None body, links = self.page_request( next_link, {**self.query_params, "page": page_id} diff --git a/swh/lister/gogs/tests/test_lister.py b/swh/lister/gogs/tests/test_lister.py --- a/swh/lister/gogs/tests/test_lister.py +++ b/swh/lister/gogs/tests/test_lister.py @@ -264,7 +264,7 @@ p3_text, p3_headers, p3_result, p3_origin_urls = trygogs_p3_last requests_mock.get(P3, text=p3_text, headers=p3_headers) - lister.session.get = mocker.spy(lister.session, "get") + lister.session.request = mocker.spy(lister.session, "request") attempt2_stats = lister.run() @@ -277,8 +277,8 @@ query_params = lister.query_params query_params["page"] = page_id - lister.session.get.assert_called_once_with( - TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params + lister.session.request.assert_called_once_with( + "GET", TRY_GOGS_URL + lister.REPO_LIST_PATH, params=query_params ) # All the 9 origins (3 pages) should be passed on to the scheduler: diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -10,14 +10,10 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple import iso8601 -import requests -from tenacity import before_sleep_log -from swh.lister.utils import http_retry from swh.scheduler.interface import SchedulerInterface from 
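Because `http_request` ends with `raise_for_status()`, listers that used to
branch on `response.status_code` now branch in an exception handler instead, as
the debian and gogs hunks above do. A sketch of the general shape (the status
code handled and the fallback value are illustrative only):

    from requests.exceptions import HTTPError

    def fetch_or_default(lister, url):
        # Sketch: branch on the status carried by the raised HTTPError.
        try:
            return lister.http_request(url).json()
        except HTTPError as http_error:
            if http_error.response.status_code == 500:
                # e.g. skip a known-broken page, as the gogs lister does
                return {"data": []}
            raise  # any other error is still blocking
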
diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py
--- a/swh/lister/golang/lister.py
+++ b/swh/lister/golang/lister.py
@@ -10,14 +10,10 @@
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 import iso8601
-import requests
-from tenacity import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -59,10 +55,7 @@
             credentials=credentials,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         self.incremental = incremental
 
     def state_from_dict(self, d: Dict[str, Any]) -> GolangStateType:
@@ -87,24 +80,8 @@
         ):
             self.updated = True
 
-    @http_retry(
-        before_sleep=before_sleep_log(logger, logging.WARNING),
-    )
     def api_request(self, url: str) -> List[str]:
-        logger.debug("Fetching URL %s", url)
-
-        response = self.session.get(url)
-
-        if response.status_code not in (200, 304):
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s for URL %s",
-                response.status_code,
-                response.url,
-            )
-
-            response.raise_for_status()
-
+        response = self.http_request(url)
         return response.text.split()
 
     def get_single_page(
diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py
--- a/swh/lister/golang/tests/test_lister.py
+++ b/swh/lister/golang/tests/test_lister.py
@@ -98,11 +98,12 @@
 
 def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
-    # first listing, should return one origin per package
-    lister = GolangLister(scheduler=swh_scheduler)
 
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(GolangLister.http_request.retry, "sleep")
+
+    # first listing, should return one origin per package
+    lister = GolangLister(scheduler=swh_scheduler)
 
     _generate_responses(datadir, requests_mock)
@@ -131,7 +132,7 @@
 
     # doing it all again (without incremental) should give us the same result
     lister = GolangLister(scheduler=swh_scheduler)
-    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
+
     _generate_responses(datadir, requests_mock)
 
     stats = lister.run()
diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py
--- a/swh/lister/maven/lister.py
+++ b/swh/lister/maven/lister.py
@@ -13,10 +13,8 @@
 from bs4 import BeautifulSoup
 import lxml
 import requests
-from tenacity.before_sleep import before_sleep_log
 
 from swh.core.github.utils import GitHubSession
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -93,13 +91,7 @@
             instance=instance,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         self.jar_origins: Dict[str, ListedOrigin] = {}
         self.github_session = GitHubSession(
@@ -112,23 +104,6 @@
     def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
         return asdict(state)
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[RepoPage]:
         """Retrieve and parse exported maven indexes to identify all pom files and src
         archives.
@@ -155,10 +130,11 @@
         # Download the main text index file.
         logger.info("Downloading computed index from %s.", self.INDEX_URL)
         assert self.INDEX_URL is not None
-        response = requests.get(self.INDEX_URL, stream=True)
-        if response.status_code != 200:
+        try:
+            response = self.http_request(self.INDEX_URL, stream=True)
+        except requests.HTTPError:
             logger.error("Index %s not found, stopping", self.INDEX_URL)
-            response.raise_for_status()
+            raise
 
         # Prepare regexes to parse index exports.
@@ -250,9 +226,9 @@
 
         # Now fetch pom files and scan them for scm info.
         logger.info("Fetching poms..")
-        for pom in out_pom:
+        for pom_url in out_pom:
             try:
-                response = self.page_request(pom, {})
+                response = self.http_request(pom_url)
                 parsed_pom = BeautifulSoup(response.content, "xml")
                 project = parsed_pom.find("project")
                 if project is None:
@@ -263,22 +239,24 @@
                 if connection is not None:
                     artifact_metadata_d = {
                         "type": "scm",
-                        "doc": out_pom[pom],
+                        "doc": out_pom[pom_url],
                         "url": connection.text,
                     }
-                    logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
+                    logger.debug(
+                        "* Yielding pom %s: %s", pom_url, artifact_metadata_d
+                    )
                     yield artifact_metadata_d
                 else:
-                    logger.debug("No scm.connection in pom %s", pom)
+                    logger.debug("No scm.connection in pom %s", pom_url)
                 else:
-                    logger.debug("No scm in pom %s", pom)
+                    logger.debug("No scm in pom %s", pom_url)
             except requests.HTTPError:
                 logger.warning(
                     "POM info page could not be fetched, skipping project '%s'",
-                    pom,
+                    pom_url,
                 )
             except lxml.etree.Error as error:
-                logger.info("Could not parse POM %s XML: %s.", pom, error)
+                logger.info("Could not parse POM %s XML: %s.", pom_url, error)
 
     def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
         """Retrieve scm origin out of the page information. Only called when type of the
diff --git a/swh/lister/maven/tests/test_lister.py b/swh/lister/maven/tests/test_lister.py
--- a/swh/lister/maven/tests/test_lister.py
+++ b/swh/lister/maven/tests/test_lister.py
@@ -127,7 +127,7 @@
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(MavenLister.page_request.retry, "sleep")
+    mocker.patch.object(MavenLister.http_request.retry, "sleep")
 
 
 def test_maven_full_listing(swh_scheduler):
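`http_request` forwards its keyword arguments to `Session.request`, which is
why call sites such as the arch, debian and maven index downloads above can
keep passing `stream=True`. A sketch, with a hypothetical index URL:

    from pathlib import Path

    def download_index(lister, destination: Path) -> Path:
        # stream=True is forwarded as-is to requests' Session.request
        response = lister.http_request(
            "https://example.org/exports/index.txt", stream=True
        )
        destination.write_bytes(response.content)
        return destination
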
diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py
--- a/swh/lister/npm/lister.py
+++ b/swh/lister/npm/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 the Software Heritage developers
+# Copyright (C) 2018-2022 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,12 +7,8 @@
 from typing import Any, Dict, Iterator, List, Optional
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, Lister
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -75,10 +71,7 @@
             self.page_size += 1
         self.incremental = incremental
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
     def state_from_dict(self, d: Dict[str, Any]) -> NpmListerState:
         return NpmListerState(**d)
@@ -95,21 +88,6 @@
             params["startkey"] = last_package_id
         return params
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, last_package_id: str) -> requests.Response:
-        params = self.request_params(last_package_id)
-        logger.debug("Fetching URL %s with params %s", self.url, params)
-        response = self.session.get(self.url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-        return response
-
     def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
         last_package_id: str = "0" if self.incremental else '""'
         if (
@@ -121,7 +99,9 @@
 
         while True:
-            response = self.page_request(last_package_id)
+            response = self.http_request(
+                self.url, params=self.request_params(last_package_id)
+            )
 
             data = response.json()
             page = data["results"] if self.incremental else data["rows"]
diff --git a/swh/lister/npm/tests/test_lister.py b/swh/lister/npm/tests/test_lister.py
--- a/swh/lister/npm/tests/test_lister.py
+++ b/swh/lister/npm/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -37,7 +37,7 @@
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(NpmLister.page_request.retry, "sleep")
+    mocker.patch.object(NpmLister.http_request.retry, "sleep")
 
 
 def _check_listed_npm_packages(lister, packages, scheduler_origins):
@@ -78,19 +78,21 @@
         additional_matcher=_match_request,
     )
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
     stats = lister.run()
 
     assert stats.pages == 2
     assert stats.origins == page_size * stats.pages
 
-    spy_get.assert_has_calls(
+    spy_request.assert_has_calls(
         [
             mocker.call(
+                "GET",
                 lister.API_FULL_LISTING_URL,
                 params=_url_params(page_size + 1, startkey='""'),
             ),
             mocker.call(
+                "GET",
                 lister.API_FULL_LISTING_URL,
                 params=_url_params(
                     page_size + 1,
@@ -132,7 +134,7 @@
         additional_matcher=_match_request,
    )
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
     assert lister.get_state_from_scheduler() == NpmListerState()
@@ -142,13 +144,15 @@
 
     last_seq = npm_incremental_listing_page2["results"][-1]["seq"]
 
-    spy_get.assert_has_calls(
+    spy_request.assert_has_calls(
         [
             mocker.call(
+                "GET",
                 lister.API_INCREMENTAL_LISTING_URL,
                 params=_url_params(page_size, since="0"),
             ),
             mocker.call(
+                "GET",
                 lister.API_INCREMENTAL_LISTING_URL,
                 params=_url_params(
                     page_size,
@@ -156,6 +160,7 @@
                 ),
             ),
             mocker.call(
+                "GET",
                 lister.API_INCREMENTAL_LISTING_URL,
                 params=_url_params(page_size, since=str(last_seq)),
             ),
@@ -189,11 +194,12 @@
 
     requests_mock.get(lister.API_INCREMENTAL_LISTING_URL, json={"results": []})
 
-    spy_get = mocker.spy(lister.session, "get")
+    spy_request = mocker.spy(lister.session, "request")
 
     lister.run()
 
-    spy_get.assert_called_with(
+    spy_request.assert_called_with(
+        "GET",
         lister.API_INCREMENTAL_LISTING_URL,
         params=_url_params(page_size, since=str(last_seq)),
     )
diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py
--- a/swh/lister/packagist/lister.py
+++ b/swh/lister/packagist/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -14,7 +14,6 @@
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -62,10 +61,7 @@
             credentials=credentials,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         self.listing_date = datetime.now().astimezone(tz=timezone.utc)
 
     def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
@@ -82,20 +78,7 @@
         return d
 
     def api_request(self, url: str) -> Any:
-        logger.debug("Fetching URL %s", url)
-
-        response = self.session.get(url)
-
-        if response.status_code not in (200, 304):
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-
-            response.raise_for_status()
-
+        response = self.http_request(url)
         # response is empty when status code is 304
         return response.json() if response.status_code == 200 else {}
@@ -134,7 +117,7 @@
                 # missing package metadata in response
                 continue
             versions_info = metadata["packages"][package_name].values()
-        except requests.exceptions.HTTPError:
+        except requests.HTTPError:
             # error when getting package metadata (usually 404 when a
             # package has been removed), skip it and process next package
             continue
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 The Software Heritage developers
+# Copyright (C) 2020-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,14 +6,23 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+import logging
 from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
 from urllib.parse import urlparse
 
+import requests
+from tenacity.before_sleep import before_sleep_log
+
 from swh.core.config import load_from_envvar
 from swh.core.utils import grouper
 from swh.scheduler import get_scheduler, model
 from swh.scheduler.interface import SchedulerInterface
 
+from . import USER_AGENT
+from .utils import http_retry
+
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ListerStats:
@@ -113,6 +122,27 @@
         self.state = self.get_state_from_scheduler()
         self.updated = False
 
+        self.session = requests.Session()
+        # Declaring the USER_AGENT is more sysadmin-friendly for the forges we list
+        self.session.headers.update({"User-Agent": USER_AGENT})
+
+    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
+
+        logger.debug("Fetching URL %s with params %s", url, kwargs.get("params"))
+
+        response = self.session.request(method, url, **kwargs)
+        if response.status_code not in (200, 304):
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+            response.raise_for_status()
+
+        return response
+
     def run(self) -> ListerStats:
         """Run the lister.
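The `http_request` helper added to `pattern.Lister` above is the crux of this
patch: a single retrying, logging HTTP entry point, with the User-Agent
declared once for every lister. A quick sketch of the retry behavior it buys,
assuming `http_retry` retries rate-limited (429) responses as the removed
template comment described (the endpoint is hypothetical):

    import requests_mock

    def demo_rate_limit_retry(lister):
        with requests_mock.Mocker() as m:
            m.get(
                "https://forge.example.org/api/repos",
                [
                    {"status_code": 429},       # first attempt is rate limited
                    {"json": {"results": []}},  # the retried attempt succeeds
                ],
            )
            body = lister.http_request("https://forge.example.org/api/repos").json()
            assert body == {"results": []}
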
diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -1,15 +1,13 @@
-# Copyright (C) 2019-2021 the Software Heritage developers
+# Copyright (C) 2019-2022 the Software Heritage developers
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
 from collections import defaultdict
 import logging
 import random
 from typing import Any, Dict, Iterator, List, Optional
 from urllib.parse import urljoin
 
-import requests
-
-from swh.lister import USER_AGENT
 from swh.lister.pattern import CredentialsType, StatelessLister
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -47,10 +45,7 @@
             scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+        self.session.headers.update({"Accept": "application/json"})
 
         if api_token is not None:
             self.api_token = api_token
@@ -91,22 +86,7 @@
         after: Optional[str] = None
         while True:
             params = self.get_request_params(after)
-            logger.debug(
-                "Retrieving results on URI %s with parameters %s",
-                self.url,
-                self.filter_params(params),
-            )
-            response = self.session.post(self.url, data=params)
-
-            if response.status_code != 200:
-                logger.warning(
-                    "Unexpected HTTP status code %s on %s: %s",
-                    response.status_code,
-                    response.url,
-                    response.content,
-                )
-
-            response.raise_for_status()
+            response = self.http_request(self.url, method="POST", data=params)
 
             response_data = response.json()
diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py
--- a/swh/lister/phabricator/tests/test_lister.py
+++ b/swh/lister/phabricator/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -27,6 +27,11 @@
     )
 
 
+@pytest.fixture(autouse=True)
+def retry_sleep_mock(mocker):
+    mocker.patch.object(PhabricatorLister.http_request.retry, "sleep")
+
+
 def test_get_repo_url(phabricator_repositories_page1):
     repos = phabricator_repositories_page1["result"]["data"]
     for repo in repos:
diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py
--- a/swh/lister/pubdev/lister.py
+++ b/swh/lister/pubdev/lister.py
@@ -2,15 +2,13 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
 import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Iterator, List, Optional
 
 import iso8601
-import requests
 from requests.exceptions import HTTPError
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
@@ -52,7 +50,7 @@
             instance=self.INSTANCE,
             url=self.BASE_URL,
         )
-        self.session = requests.Session()
+
         self.session.headers.update(
             {
                 "Accept": "application/json",
@@ -60,23 +58,6 @@
             }
         )
 
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.debug("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[PubDevListerPage]:
         """Yield an iterator which returns 'page'
@@ -88,8 +69,8 @@
         There is only one page that list all origins url based on
         "{base_url}packages/{pkgname}"
         """
-        response = self.page_request(
-            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url), params={}
+        response = self.http_request(
+            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)
         )
         yield response.json()["packages"]
@@ -102,7 +83,7 @@
             base_url=self.url, pkgname=pkgname
         )
         try:
-            response = self.page_request(url=package_info_url, params={})
+            response = self.http_request(url=package_info_url)
         except HTTPError:
             logger.warning(
                 "Failed to fetch metadata for package %s, skipping it from listing.",
diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -15,14 +15,11 @@
 import iso8601
 import lxml
 import requests
-from tenacity.before_sleep import before_sleep_log
 
 from swh.core.api.classes import stream_results
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
@@ -127,11 +124,8 @@
         # Will hold the currently saved "last modified" dates to compare against our
         # requests.
         self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
-        self.session = requests.Session()
-        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
-        self.session.headers.update(
-            {"Accept": "application/json", "User-Agent": USER_AGENT}
-        )
+
+        self.session.headers.update({"Accept": "application/json"})
         self.incremental = incremental
 
     def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
@@ -208,26 +202,6 @@
         self._project_last_modified = listed_origins
         return listed_origins
 
-    @http_retry(
-        before_sleep=before_sleep_log(logger, logging.WARNING),
-    )
-    def page_request(self, url, params) -> requests.Response:
-        # Log listed URL to ease debugging
-        logger.debug("Fetching URL %s with params %s", url, params)
-        response = self.session.get(url, params=params)
-
-        if response.status_code != 200:
-            # Log response content to ease debugging
-            logger.warning(
-                "Unexpected HTTP status code %s for URL %s",
-                response.status_code,
-                response.url,
-            )
-            # The lister must fail on blocking errors
-            response.raise_for_status()
-
-        return response
-
     def get_pages(self) -> Iterator[SourceForgeListerPage]:
         """
         SourceForge has a main XML sitemap that lists its sharded sitemaps for all
@@ -240,7 +214,7 @@
         Lastly we use the information of which VCS are used to build the predictable
         clone URL for any given VCS.
         """
-        sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text
+        sitemap_contents = self.http_request(MAIN_SITEMAP_URL).text
         tree = ElementTree.fromstring(sitemap_contents)
 
         for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
@@ -259,7 +233,7 @@
                 continue
             self.state.subsitemap_last_modified[sub_url] = last_modified
-            subsitemap_contents = self.page_request(sub_url, {}).text
+            subsitemap_contents = self.http_request(sub_url).text
             subtree = ElementTree.fromstring(subsitemap_contents)
 
             yield from self._get_pages_from_subsitemap(subtree)
@@ -351,9 +325,9 @@
         logger.debug(msg, namespace, project)
 
         try:
-            res = self.page_request(endpoint, {}).json()
+            res = self.http_request(endpoint).json()
         except requests.HTTPError:
-            # We've already logged in `page_request`
+            # We've already logged in `http_request`
             return []
 
         tools = res.get("tools")
@@ -373,7 +347,7 @@
         # and multiple origin URLs can be produced for a same project.
         cvs_info_url = f"http://{project}.cvs.sourceforge.net"
         try:
-            response = self.page_request(cvs_info_url, params={})
+            response = self.http_request(cvs_info_url)
         except requests.HTTPError:
             logger.warning(
                 "CVS info page could not be fetched, skipping project '%s'",
@@ -413,7 +387,7 @@
         # and a lot of them are 404 now.
         url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"
         try:
-            response = self.page_request(url, params={})
+            response = self.http_request(url)
             if "To get this branch, use:" not in response.text:
                 # If a bzr project has multiple branches, we need to extract their
                 # names from the repository landing page and create one listed origin
diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py
--- a/swh/lister/sourceforge/tests/test_lister.py
+++ b/swh/lister/sourceforge/tests/test_lister.py
@@ -1,7 +1,8 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
 import datetime
 import functools
 import json
@@ -368,7 +369,7 @@
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(
         MAIN_SITEMAP_URL,
@@ -438,7 +439,7 @@
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     # Exponential retries take a long time, so stub time.sleep
-    mocked_sleep = mocker.patch.object(lister.page_request.retry, "sleep")
+    mocked_sleep = mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)
@@ -458,7 +459,7 @@
 ):
     lister = SourceForgeLister(scheduler=swh_scheduler)
     # Exponential retries take a long time, so stub time.sleep
-    mocker.patch.object(lister.page_request.retry, "sleep")
+    mocker.patch.object(lister.http_request.retry, "sleep")
 
     requests_mock.get(
         MAIN_SITEMAP_URL,
diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py
--- a/swh/lister/tuleap/lister.py
+++ b/swh/lister/tuleap/lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -8,14 +8,10 @@
 from urllib.parse import urljoin
 
 import iso8601
-import requests
-from tenacity.before_sleep import before_sleep_log
 
-from swh.lister.utils import http_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -57,30 +53,7 @@
             instance=instance,
         )
 
-        self.session = requests.Session()
-        self.session.headers.update(
-            {
-                "Accept": "application/json",
-                "User-Agent": USER_AGENT,
-            }
-        )
-
-    @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
-    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
-
-        logger.info("Fetching URL %s with params %s", url, params)
-
-        response = self.session.get(url, params=params)
-        if response.status_code != 200:
-            logger.warning(
-                "Unexpected HTTP status code %s on %s: %s",
-                response.status_code,
-                response.url,
-                response.content,
-            )
-            response.raise_for_status()
-
-        return response
+        self.session.headers.update({"Accept": "application/json"})
 
     @classmethod
     def results_simplified(cls, url: str, repo_type: str, repo: RepoPage) -> RepoPage:
@@ -97,14 +70,14 @@
         return rep
 
     def _get_repositories(self, url_repo) -> List[Dict[str, Any]]:
-        ret = self.page_request(url_repo, {})
+        ret = self.http_request(url_repo)
         reps_list = ret.json()["repositories"]
         limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"])
         offset = int(ret.headers["X-PAGINATION-LIMIT"])
         size = int(ret.headers["X-PAGINATION-SIZE"])
         while offset < size:
             url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(limit)
-            ret = self.page_request(url_offset, {}).json()
+            ret = self.http_request(url_offset).json()
             reps_list = reps_list + ret["repositories"]
             offset += limit
         return reps_list
@@ -115,7 +88,7 @@
         url_projects = url_api + "/projects/"
 
         # Get the list of projects.
-        response = self.page_request(url_projects, {})
+        response = self.http_request(url_projects)
         projects_list = response.json()
         limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
         offset = int(response.headers["X-PAGINATION-LIMIT"])
@@ -124,7 +97,7 @@
             url_offset = (
                 url_projects + "?offset=" + str(offset) + "&limit=" + str(limit)
             )
-            ret = self.page_request(url_offset, {}).json()
+            ret = self.http_request(url_offset).json()
             projects_list = projects_list + ret
             offset += limit
diff --git a/swh/lister/tuleap/tests/test_lister.py b/swh/lister/tuleap/tests/test_lister.py
--- a/swh/lister/tuleap/tests/test_lister.py
+++ b/swh/lister/tuleap/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -94,7 +94,7 @@
 
 @pytest.fixture(autouse=True)
 def retry_sleep_mock(mocker):
-    mocker.patch.object(TuleapLister.page_request.retry, "sleep")
+    mocker.patch.object(TuleapLister.http_request.retry, "sleep")
 
 
 def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]):