D8339: Arch: Add throttling retry for scrapping and resources download

D8339.diff
diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py
--- a/swh/lister/arch/lister.py
+++ b/swh/lister/arch/lister.py
@@ -12,11 +12,14 @@
 
 from bs4 import BeautifulSoup
 import requests
+from tenacity.before_sleep import before_sleep_log
 
+from swh.lister.utils import throttling_retry
 from swh.model.hashutil import hash_to_hex
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
+from .. import USER_AGENT
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
@@ -122,6 +125,29 @@
         )
 
         self.flavours = flavours
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
+
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
 
     def scrap_package_versions(
         self, name: str, repo: str, base_url: str
@@ -153,7 +179,8 @@
         url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format(
             pkgname=name, base_url=base_url
         )
-        soup = BeautifulSoup(requests.get(url).text, "html.parser")
+        response = self.request_get(url=url, params={})
+        soup = BeautifulSoup(response.text, "html.parser")
         links = soup.find_all("a", href=True)
 
         # drop the first line (used to go to up directory)
@@ -236,7 +263,7 @@
         Returns:
             a directory Path where the archive has been extracted to.
         """
-        res = requests.get(url)
+        res = self.request_get(url=url, params={})
         destination_path.parent.mkdir(parents=True, exist_ok=True)
         destination_path.write_bytes(res.content)
 
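Note on the mechanics: throttling_retry comes from swh.lister.utils and wraps
the tenacity library so that a decorated callable is retried with exponential
backoff whenever the server throttles it with HTTP 429, while the
requests.Session set up in __init__ stamps every request with the lister's
User-Agent. Below is a minimal, self-contained sketch of that retry pattern,
not the actual swh.lister.utils implementation; throttling_retry_sketch,
fetch, MAX_ATTEMPTS and WAIT_EXP_BASE are illustrative names and values.

# Illustrative sketch only -- approximates throttling_retry, is not swh code.
import logging

import requests
from requests.exceptions import HTTPError
from tenacity import retry as tenacity_retry
from tenacity import retry_if_exception
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_exponential

logger = logging.getLogger(__name__)

MAX_ATTEMPTS = 5  # hypothetical limits, chosen for the sketch
WAIT_EXP_BASE = 2


def is_throttling_exception(exc: BaseException) -> bool:
    # A 429 surfaces as an HTTPError once raise_for_status() has run.
    return isinstance(exc, HTTPError) and exc.response.status_code == 429


def throttling_retry_sketch(**retry_args):
    # Retry only on throttling, wait exponentially longer between attempts,
    # give up after MAX_ATTEMPTS and re-raise the last error (reraise=True).
    return tenacity_retry(
        retry=retry_if_exception(is_throttling_exception),
        wait=wait_exponential(exp_base=WAIT_EXP_BASE),
        stop=stop_after_attempt(MAX_ATTEMPTS),
        reraise=True,
        **retry_args,
    )


@throttling_retry_sketch(before_sleep=before_sleep_log(logger, logging.WARNING))
def fetch(url: str) -> requests.Response:
    # Same contract as request_get in the diff: log unexpected statuses,
    # raise on bad status (which is what turns a 429 into a retry).
    response = requests.get(url)
    if response.status_code != 200:
        logger.warning(
            "Unexpected HTTP status code %s on %s", response.status_code, url
        )
    response.raise_for_status()
    return response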
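To see the retry end to end, here is a hypothetical check of the sketch above
built on the requests_mock library (an assumption of this example; D8339
itself ships no test): the first two responses are throttled, the third
succeeds, so fetch() returns after two warned retries. Note the sketch really
sleeps between attempts (roughly 2 then 4 seconds with WAIT_EXP_BASE = 2).

import requests_mock

with requests_mock.Mocker() as m:
    # A response list: requests_mock consumes one entry per request made.
    m.get(
        "https://archive.example.org/packages/",
        [
            {"status_code": 429},  # attempt 1: throttled, retried
            {"status_code": 429},  # attempt 2: throttled, retried
            {"status_code": 200, "text": "ok"},  # attempt 3: success
        ],
    )
    assert fetch("https://archive.example.org/packages/").text == "ok"
    assert m.call_count == 3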