swh/lister/arch/lister.py
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import datetime
import logging
from pathlib import Path
import re
import tarfile
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote, urljoin

from bs4 import BeautifulSoup
import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import throttling_retry
from swh.model.hashutil import hash_to_hex
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
ArchListerPage = List[Dict[str, Any]]
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=flavours["official"]["base_info_url"],
            instance=self.INSTANCE,
        )
        self.flavours = flavours
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": USER_AGENT,
            }
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def request_get(self, url: str, params: Dict[str, Any]) -> requests.Response:
        logger.info("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()
        return response
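
    # Illustrative sketch, not part of the changeset: how `request_get` might be
    # exercised by the scraping helpers below. The URL is hypothetical; thanks to
    # the `throttling_retry` decorator, a rate-limited (HTTP 429) response is
    # expected to be retried before the warning/raise logic above applies.
    #
    #     response = self.request_get(
    #         url="https://archive.archlinux.org/packages/d/dialog/",  # hypothetical
    #         params={},
    #     )
    #     html = response.text  # fed to BeautifulSoup in scrap_package_versions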
    def scrap_package_versions(
        self, name: str, repo: str, base_url: str
    ) -> List[Dict[str, Any]]:
        """Given a package 'name' and 'repo', make an http call to the origin url and
        parse its content to get the package versions artifacts data.

        This method is suitable only for 'official' Arch Linux, not 'arm'.
Show All 15 Lines
"length": 180000, | "length": 180000, | ||||
"filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", | "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", | ||||
"last_modified": "2019-02-13T08:36:00"}, | "last_modified": "2019-02-13T08:36:00"}, | ||||
] | ] | ||||
""" | """ | ||||
url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( | url = self.ARCH_PACKAGE_VERSIONS_URL_PATTERN.format( | ||||
pkgname=name, base_url=base_url | pkgname=name, base_url=base_url | ||||
) | ) | ||||
soup = BeautifulSoup(requests.get(url).text, "html.parser") | response = self.request_get(url=url, params={}) | ||||
soup = BeautifulSoup(response.text, "html.parser") | |||||
links = soup.find_all("a", href=True) | links = soup.find_all("a", href=True) | ||||
        # drop the first link, it points to the parent directory
        if links[0].attrs["href"] == "../":
            links.pop(0)

        versions = []
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines
    def get_repo_archive(self, url: str, destination_path: Path) -> Path:
        Args:
            url: url of the .tar.gz archive to download
            destination_path: the path on disk where to extract archive

        Returns:
            a directory Path where the archive has been extracted to.
        """
        res = self.request_get(url=url, params={})
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        destination_path.write_bytes(res.content)

        extract_to = Path(str(destination_path).split(".tar.gz")[0])
        tar = tarfile.open(destination_path)
        tar.extractall(path=extract_to)
        tar.close()
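
        # Illustrative sketch, not part of the changeset: a hypothetical call to
        # this helper (the archive URL and destination are made up for the example).
        #
        #     extracted = self.get_repo_archive(
        #         url="https://example.org/archlinux/core.files.tar.gz",   # hypothetical
        #         destination_path=Path("/tmp/arch/core.files.tar.gz"),    # hypothetical
        #     )
        #     # per the extraction above, `extracted` would point to
        #     # Path("/tmp/arch/core.files")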
▲ Show 20 Lines • Show All 226 Lines • Show Last 20 Lines