swh/lister/arch/lister.py
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import logging
from pathlib import Path
import re
import tarfile
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote, urljoin

from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import throttling_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
ArchListerPage = List[Dict[str, Any]]
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=flavours["official"]["base_info_url"],
            instance=self.INSTANCE,
        )
        self.flavours = flavours
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
        logger.info("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()
        return response
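
    # Illustrative sketch, not part of the changeset: how `request_get` might be
    # exercised by the scraping helpers below. The URL is hypothetical; thanks to
    # the `throttling_retry` decorator, a rate-limited (HTTP 429) response is
    # expected to be retried before the warning/raise logic above applies.
    #
    #     response = self.request_get(
    #         url="https://archive.archlinux.org/packages/d/dialog/",  # hypothetical
    #         params={},
    #     )
    #     html = response.text  # fed to BeautifulSoup in scrap_package_versions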
    def scrap_package_versions(
        self, name: str, repo: str, base_url: str
    ) -> List[Dict[str, Any]]:
        """Given a package 'name' and 'repo', make an http call to the origin url and
        parse its content to get the package versions artifacts data.

        This method is suitable only for 'official' Arch Linux, not 'arm'.
Show All 15 Lines
"length": 180000, | "length": 180000, | ||||
"filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", | "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", | ||||
"last_modified": "2019-02-13T08:36:00"}, | "last_modified": "2019-02-13T08:36:00"}, | ||||
] | ] | ||||
""" | """ | ||||
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( | url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( | ||||
pkgname=name, base_url=base_url | pkgname=name, base_url=base_url | ||||
) | ) | ||||
soup = BeautifulSoup(requests.get(url).text, "html.parser") | response = self.request_get(url=url, params={}) | ||||
soup = BeautifulSoup(response.text, "html.parser") | |||||
links = soup.find_all("a", href=True) | links = soup.find_all("a", href=True) | ||||
        # drop the first link, it points to the parent directory
        if links[0].attrs["href"] == "../":
            links.pop(0)

        versions = []
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines
    def get_repo_archive(self, url: str, destination_path: Path) -> Path:
        Args:
            url: url of the .tar.gz archive to download
            destination_path: the path on disk where to extract archive

        Returns:
            a directory Path where the archive has been extracted to.
        """
        res = self.request_get(url=url, params={})
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(res.content)

        extract_to = Path(str(destination_path).split(".tar.gz")[0])
        tar = tarfile.open(destination_path)
        tar.extractall(path=extract_to)
        tar.close()
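
        # Illustrative sketch, not part of the changeset: a hypothetical call to
        # this helper (the archive URL and destination are made up for the example).
        #
        #     extracted = self.get_repo_archive(
        #         url="https://example.org/archlinux/core.files.tar.gz",   # hypothetical
        #         destination_path=Path("/tmp/arch/core.files.tar.gz"),    # hypothetical
        #     )
        #     # per the extraction above, `extracted` would point to
        #     # Path("/tmp/arch/core.files")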
▲ Show 20 Lines • Show All 226 Lines • Show Last 20 Lines