def register() -> Mapping[str, Any]:
    """Return the registration data of the Arch Linux loader worker module.

    Returns:
        A mapping with two entries: ``task_modules``, a one-element list with
        the dotted name of this package's ``tasks`` module, and ``loader``,
        the ``ArchLoader`` class.
    """
    # The loader class is imported when register() is called, not when this
    # package is imported.
    from .loader import ArchLoader

    tasks_module = f"{__name__}.tasks"
    return {"task_modules": [tasks_module], "loader": ArchLoader}
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
import re
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple

import attr

from swh.loader.package.loader import BasePackageInfo, PackageLoader
from swh.loader.package.utils import release_name
from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone
from swh.storage.interface import StorageInterface

# One ``key = value`` entry of a .PKGINFO file. Only a space or a tab is
# accepted around "=": "\s" would also match a newline, so an empty value
# written without a trailing space would swallow the following entry.
_PKGINFO_ENTRY = re.compile(r"^(\w+)[ \t]=[ \t]?(.*)$", re.M)

# Version tokenizer: runs of digits, runs of lower-case letters, or a dot.
# This is the same splitting distutils' LooseVersion applied.
_VERSION_TOKEN = re.compile(r"(\d+|[a-z]+|\.)")


def _version_sort_key(version: str) -> List[Tuple[int, int, str]]:
    """Build a sort key for a version string.

    The version is split into numeric and textual components (dots and empty
    pieces are dropped). Numeric components compare as integers, textual ones
    as strings, and a textual component always sorts before a numeric one, so
    any two keys are comparable.

    Args:
        version: a version string (e.g: "1:1.3_20190211-1")

    Returns:
        A list of ``(rank, number, text)`` tuples usable as a ``sort`` key
    """
    key: List[Tuple[int, int, str]] = []
    for token in _VERSION_TOKEN.split(version):
        if not token or token == ".":
            continue
        try:
            key.append((1, int(token), ""))
        except ValueError:
            key.append((0, 0, token))
    return key


@attr.s
class ArchPackageInfo(BasePackageInfo):
    """Description of one Arch Linux package artifact to load."""

    name = attr.ib(type=str)
    """Name of the package"""

    version = attr.ib(type=str)
    """Current version"""

    last_modified = attr.ib(type=str)
    """File last modified date as release date"""


def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]:
    """Extract intrinsic metadata from the .PKGINFO file at dir_path.

    Each Arch Linux package has a .PKGINFO file at the root of the archive,
    made of ``key = value`` lines.

    Args:
        dir_path: A directory on disk where a package has been extracted

    Returns:
        A dict mapping each lower-cased key to its value as a string. When a
        key is repeated (e.g. ``depend``) only its last value is kept. When a
        ``url`` entry exists, its value is also available as ``project_url``.

    Raises:
        FileNotFoundError: if there is no .PKGINFO file in dir_path
    """
    raw = Path(dir_path, ".PKGINFO").read_bytes()
    entries = _PKGINFO_ENTRY.findall(raw.decode())
    data = {key.lower(): value for key, value in entries}
    if "url" in data:
        data["project_url"] = data["url"]
    return data


class ArchLoader(PackageLoader[ArchPackageInfo]):
    """Load the artifacts of one Arch Linux package origin."""

    visit_type = "arch"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        artifacts: List[Dict[str, Any]],
        **kwargs,
    ):
        """
        Args:
            storage: storage passed on to the base package loader
            url: origin url of the package
            artifacts: one dict per package version; this class reads the
                keys ``version``, ``name``, ``filename``, ``url`` and
                ``last_modified``. When two artifacts share a version, the
                last one wins.
            **kwargs: forwarded to the base package loader
        """
        super().__init__(storage=storage, url=url, **kwargs)
        self.url = url
        self.artifacts: Dict[str, Dict] = {
            artifact["version"]: artifact for artifact in artifacts
        }

    def get_versions(self) -> Sequence[str]:
        """Get all released versions of an Arch Linux package

        Returns:
            A sequence of versions, sorted from oldest to newest

        Example::

            ["0.1.1", "0.10.2"]
        """
        # distutils (and its LooseVersion) is deprecated and removed from
        # Python 3.12; the local key keeps the same ordering and does not fail
        # when a number and a word meet at the same position.
        return sorted(self.artifacts, key=_version_sort_key)

    def get_default_version(self) -> str:
        """Get the newest release version of an Arch Linux package

        Returns:
            A string representing a version

        Example::

            "0.1.2"
        """
        return self.get_versions()[-1]

    def get_package_info(self, version: str) -> Iterator[Tuple[str, ArchPackageInfo]]:
        """Get release name and package information from version

        Args:
            version: arch version (e.g: "0.1.0")

        Returns:
            Iterator of tuple (release_name, p_info)
        """
        artifact = self.artifacts[version]
        # Invariant from __init__: artifacts are indexed by their own version.
        assert version == artifact["version"]

        p_info = ArchPackageInfo(
            name=artifact["name"],
            filename=artifact["filename"],
            url=artifact["url"],
            version=version,
            last_modified=artifact["last_modified"],
        )
        yield release_name(version, artifact["filename"]), p_info

    def build_release(
        self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        """Build the synthetic release of one extracted package version.

        The author and the description come from the ``packager`` and
        ``pkgdesc`` entries of the extracted .PKGINFO file, the date from the
        artifact's ``last_modified`` value.

        Args:
            p_info: package information of the version being loaded
            uncompressed_path: directory where the package has been extracted
            directory: identifier of the loaded directory, used as target

        Returns:
            The release object targeting ``directory``

        Raises:
            KeyError: if .PKGINFO has no ``packager`` or ``pkgdesc`` entry
        """
        intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path))
        author = Person.from_fullname(intrinsic_metadata["packager"].encode())
        description = intrinsic_metadata["pkgdesc"]

        message = (
            f"Synthetic release for Arch Linux source package {p_info.name} "
            f"version {p_info.version}\n\n"
            f"{description}\n"
        )
        return Release(
            name=p_info.version.encode(),
            author=author,
            date=TimestampWithTimezone.from_iso8601(p_info.last_modified),
            message=message.encode(),
            target_type=ObjectType.DIRECTORY,
            target=directory,
            synthetic=True,
        )
# Both compressors are needed by the `tar -a` calls below: stop right away
# with a message when one is missing instead of carrying on.
for tool in zstd xz; do
    if ! command -v "$tool" > /dev/null; then
        echo "you should install '$tool' to run this script" >&2
        exit 1
    fi
done

# files and directories
# -p: do not abort when the output directories already exist
mkdir -p https_archive.archlinux.org
mkdir -p https_uk.mirror.archlinuxarm.org
mkdir -p tmp_dir/arch/

cd tmp_dir/arch/

mkdir 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64'
mkdir 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64'

echo -e '''pkgname = dialog
pkgbase = dialog
pkgver = 1:1.3_20190211-1
pkgdesc = A tool to display dialog boxes from shell scripts
url = https://invisible-island.net/dialog/
builddate = 1550046926
packager = Evangelos Foutras
size = 455680
arch = x86_64
license = LGPL2.1
provides = libdialog.so=15-64
depend = sh
''' > packages_d_dialog_dialog-1:1.3_20190211-1-x86_64/.PKGINFO

echo -e '''pkgname = dialog
pkgbase = dialog
pkgver = 1:1.3_20220414-1
pkgdesc = A tool to display dialog boxes from shell scripts
url = https://invisible-island.net/dialog/
builddate = 1650081535
packager = Evangelos Foutras
size = 483988
arch = x86_64
license = LGPL2.1
provides = libdialog.so=15-64
depend = sh
depend = ncurses
''' > packages_d_dialog_dialog-1:1.3_20220414-1-x86_64/.PKGINFO

# Compress the package folders to .pkg.tar.xz and .pkg.tar.zst archives
# (tar -a selects the compressor from the archive suffix)

tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz' -C 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' .
tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst' -C 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' .
def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage):
    """Load the first expected package, which has two versions, and check the
    resulting snapshot, the storage statistics and the visit status."""
    package = EXPECTED_PACKAGES[0]
    loader = ArchLoader(
        swh_storage,
        url=package["url"],
        artifacts=package["artifacts"],
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3"
    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    # One release branch per version; HEAD is an alias of the newest one.
    old_branch = (
        b"releases/1:1.3_20190211-1/" b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz"
    )
    new_branch = (
        b"releases/1:1.3_20220414-1/" b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst"
    )
    branches = {
        old_branch: SnapshotBranch(
            target=hash_to_bytes("37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"),
            target_type=TargetType.RELEASE,
        ),
        new_branch: SnapshotBranch(
            target=hash_to_bytes("020d3f5627df7474f257fd04f1ede4415296e265"),
            target_type=TargetType.RELEASE,
        ),
        b"HEAD": SnapshotBranch(
            target=b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst",
            target_type=TargetType.ALIAS,
        ),
    }
    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches=branches,
    )

    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert_last_visit_matches(
        swh_storage,
        url=package["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )


def test_arch_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir):
    """Load an origin whose single artifact url is not served by the mock."""
    url = "https://nowhere/packages/42"
    missing_artifact = {
        "filename": "42-0.0.1.pkg.xz",
        "url": "https://mirror2.nowhere/pkg/42-0.0.1.pkg.xz",
        "version": "0.0.1",
        "arch": "aarch64",
        "name": "42",
        "repo": "community",
        "length": 42,
        "last_modified": "2022-04-07T21:08:14",
    }
    loader = ArchLoader(swh_storage, url, artifacts=[missing_artifact])

    # NOTE(review): pytest.raises(Exception) is also satisfied by an
    # AssertionError, so a failing comparison on the next line makes this test
    # pass, and nothing after the first raising statement runs. Confirm which
    # call is expected to raise.
    with pytest.raises(Exception):
        assert loader.load() == {"status": "failed"}
        assert_last_visit_matches(
            swh_storage, url, status="not_found", type="arch", snapshot=None
        )
def test_tasks_arch_loader(
    mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
):
    """Send the LoadArch task with ArchLoader.load patched out and check that
    the task succeeds and returns the patched load status."""
    mock_load = mocker.patch("swh.loader.package.arch.loader.ArchLoader.load")
    mock_load.return_value = {"status": "eventful"}

    artifact = {
        "version": "0.0.1",
        "url": "https://somewhere/some-package-0.0.1.pkg.xz",
        "filename": "some-package-0.0.1.pkg.xz",
        "arch": "aarch64",
        "name": "some-package",
        "repo": "community",
        "length": 42,
        "last_modified": "1970-01-01T21:08:14",
    }
    task_kwargs = {
        "url": "some-url/packages/s/some-package",
        "artifacts": [artifact],
    }

    res = swh_scheduler_celery_app.send_task(
        "swh.loader.package.arch.tasks.LoadArch",
        kwargs=task_kwargs,
    )
    assert res
    res.wait()
    assert res.successful()
    assert mock_load.called
    assert res.result == {"status": "eventful"}