diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -2,11 +2,13 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import datetime import logging from pathlib import Path import re import tarfile +import tempfile from typing import Any, Dict, Iterator, List, Optional from urllib.parse import unquote, urljoin @@ -82,8 +84,6 @@ VISIT_TYPE = "arch" INSTANCE = "arch" - DESTINATION_PATH = Path("/tmp/archlinux_archive") - ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( @@ -360,109 +360,110 @@ def _get_repo_page( self, name: str, flavour: Dict[str, Any], arch: str, repo: str ) -> ArchListerPage: - page = [] - if name == "official": - prefix = urljoin(flavour["base_archive_url"], "/repos/last/") - filename = f"{repo}.files.tar.gz" - archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") - destination_path = Path(self.DESTINATION_PATH, arch, filename) - base_url = flavour["base_archive_url"] - dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN - base_info_url = flavour["base_info_url"] - info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN - elif name == "arm": - filename = f"{repo}.files.tar.gz" - archive_url = urljoin( - flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" + with tempfile.TemporaryDirectory() as tmpdir: + page = [] + if name == "official": + prefix = urljoin(flavour["base_archive_url"], "/repos/last/") + filename = f"{repo}.files.tar.gz" + archive_url = urljoin(prefix, f"{repo}/os/{arch}/{filename}") + destination_path = Path(tmpdir, arch, filename) + base_url = flavour["base_archive_url"] + dl_url_fmt = self.ARCH_PACKAGE_DOWNLOAD_URL_PATTERN + base_info_url = flavour["base_info_url"] + info_url_fmt = self.ARCH_PACKAGE_URL_PATTERN + elif name == "arm": + filename = f"{repo}.files.tar.gz" + archive_url = urljoin( + flavour["base_mirror_url"], f"{arch}/{repo}/{filename}" + ) + destination_path = Path(tmpdir, arch, filename) + base_url = flavour["base_mirror_url"] + dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN + base_info_url = flavour["base_info_url"] + info_url_fmt = self.ARM_PACKAGE_URL_PATTERN + + archive = self.get_repo_archive( + url=archive_url, destination_path=destination_path ) - destination_path = Path(self.DESTINATION_PATH, arch, filename) - base_url = flavour["base_mirror_url"] - dl_url_fmt = self.ARM_PACKAGE_DOWNLOAD_URL_PATTERN - base_info_url = flavour["base_info_url"] - info_url_fmt = self.ARM_PACKAGE_URL_PATTERN - - archive = self.get_repo_archive( - url=archive_url, destination_path=destination_path - ) - - assert archive - - packages_desc = list(archive.glob("**/desc")) - logger.debug( - "Processing %(instance)s source packages info from " - "%(flavour)s %(arch)s %(repo)s repository, " - "(%(qty)s packages).", - dict( - instance=self.instance, - flavour=name, - arch=arch, - repo=repo, - qty=len(packages_desc), - ), - ) - for package_desc in packages_desc: - data = self.parse_desc_file( - path=package_desc, - repo=repo, - base_url=base_url, - dl_url_fmt=dl_url_fmt, + assert archive + + packages_desc = list(archive.glob("**/desc")) + logger.debug( + "Processing %(instance)s source packages info from " + "%(flavour)s %(arch)s %(repo)s repository, " + "(%(qty)s packages).", + dict( + instance=self.instance, + flavour=name, + arch=arch, + repo=repo, + qty=len(packages_desc), + ), ) - assert data["builddate"] - last_modified = datetime.datetime.fromtimestamp( - float(data["builddate"]), tz=datetime.timezone.utc - ) + for package_desc in packages_desc: + data = self.parse_desc_file( + path=package_desc, + repo=repo, + base_url=base_url, + dl_url_fmt=dl_url_fmt, + ) - assert data["name"] - assert data["filename"] - assert data["arch"] - url = info_url_fmt.format( - base_url=base_info_url, - pkgname=data["name"], - filename=data["filename"], - repo=repo, - arch=data["arch"], - ) + assert data["builddate"] + last_modified = datetime.datetime.fromtimestamp( + float(data["builddate"]), tz=datetime.timezone.utc + ) - assert data["version"] - if name == "official": - # find all versions of a package scrapping archive - versions = self.scrap_package_versions( - name=data["name"], repo=repo, base_url=base_url + assert data["name"] + assert data["filename"] + assert data["arch"] + url = info_url_fmt.format( + base_url=base_info_url, + pkgname=data["name"], + filename=data["filename"], + repo=repo, + arch=data["arch"], ) - elif name == "arm": - # There is no way to get related versions of a package, - # but 'data' represents the latest released version, - # use it in this case - assert data["builddate"] - assert data["csize"] - assert data["url"] - versions = [ - dict( - name=data["name"], - version=data["version"], - repo=repo, - arch=data["arch"], - filename=data["filename"], - url=data["url"], - last_modified=last_modified.replace(tzinfo=None).isoformat( - timespec="seconds" - ), - length=int(data["csize"]), - ) - ] - package = { - "name": data["name"], - "version": data["version"], - "last_modified": last_modified, - "url": url, - "versions": versions, - "data": data, - } - page.append(package) - return page + assert data["version"] + if name == "official": + # find all versions of a package scrapping archive + versions = self.scrap_package_versions( + name=data["name"], repo=repo, base_url=base_url + ) + elif name == "arm": + # There is no way to get related versions of a package, + # but 'data' represents the latest released version, + # use it in this case + assert data["builddate"] + assert data["csize"] + assert data["url"] + versions = [ + dict( + name=data["name"], + version=data["version"], + repo=repo, + arch=data["arch"], + filename=data["filename"], + url=data["url"], + last_modified=last_modified.replace(tzinfo=None).isoformat( + timespec="seconds" + ), + length=int(data["csize"]), + ) + ] + + package = { + "name": data["name"], + "version": data["version"], + "last_modified": last_modified, + "url": url, + "versions": versions, + "data": data, + } + page.append(package) + return page def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]: """Iterate on all arch pages and yield ListedOrigin instances."""