Page MenuHomeSoftware Heritage

D8339.diff
No OneTemporary

D8339.diff

diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -12,11 +12,14 @@
from bs4 import BeautifulSoup
import requests
+from tenacity.before_sleep import before_sleep_log
+from swh.lister.utils import throttling_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
+from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
@@ -122,6 +125,29 @@
)
self.flavours = flavours
+ self.session = requests.Session()
+ self.session.headers.update(
+ {
+ "User-Agent": USER_AGENT,
+ }
+ )
+
+ @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
+
+ logger.info("Fetching URL %s with params %s", url, params)
+
+ response = self.session.get(url, params=params)
+ if response.status_code != 200:
+ logger.warning(
+ "Unexpected HTTP status code %s on %s: %s",
+ response.status_code,
+ response.url,
+ response.content,
+ )
+ response.raise_for_status()
+
+ return response
def scrap_package_versions(
self, name: str, repo: str, base_url: str
@@ -153,7 +179,8 @@
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
pkgname=name, base_url=base_url
)
- soup = BeautifulSoup(requests.get(url).text, "html.parser")
+ response = self.request_get(url=url, params={})
+ soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a", href=True)
# drop the first line (used to go to up directory)
@@ -236,7 +263,7 @@
Returns:
a directory Path where the archive has been extracted to.
"""
- res = requests.get(url)
+ res = self.request_get(url=url, params={})
destination_path.parent.mkdir(parents=True, exist_ok=True)
destination_path.write_bytes(res.content)

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 9:58 AM (19 h, 57 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219565

Event Timeline