diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -12,11 +12,14 @@ from bs4 import BeautifulSoup import requests +from tenacity.before_sleep import before_sleep_log +from swh.lister.utils import throttling_retry from swh.model.hashutil import hash_to_hex from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin +from .. import USER_AGENT from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) @@ -122,6 +125,29 @@ ) self.flavours = flavours + self.session = requests.Session() + self.session.headers.update( + { + "User-Agent": USER_AGENT, + } + ) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response: + + logger.info("Fetching URL %s with params %s", url, params) + + response = self.session.get(url, params=params) + if response.status_code != 200: + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + response.raise_for_status() + + return response def scrap_package_versions( self, name: str, repo: str, base_url: str @@ -153,7 +179,8 @@ url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( pkgname=name, base_url=base_url ) - soup = BeautifulSoup(requests.get(url).text, "html.parser") + response = self.request_get(url=url, params={}) + soup = BeautifulSoup(response.text, "html.parser") links = soup.find_all("a", href=True) # drop the first line (used to go to up directory) @@ -236,7 +263,7 @@ Returns: a directory Path where the archive has been extracted to. """ - res = requests.get(url) + res = self.request_get(url=url, params={}) destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(res.content)