Page MenuHomeSoftware Heritage

D7995.diff
No OneTemporary

D7995.diff

diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst
--- a/docs/package-loader-specifications.rst
+++ b/docs/package-loader-specifications.rst
@@ -20,6 +20,15 @@
- author
- date
- Notes
+ * - arch
+ - ``p_info.​version``
+ - ``release_name(​version, filename)``
+ - =version
+ - Synthetic release for Arch Linux source package {p_info.name} version {p_info.version} {description}
+ - true
+ - from intrinsic metadata
+ - from extra_loader_arguments['artifacts']
+ - Intrinsic metadata extracted from .PKGINFO file of the package
* - archive
- passed as arg
- ``release_name(​version)``
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
-swh.core >= 0.3
+swh.core >= 2.12
swh.model >= 4.4.0
swh.objstorage >= 0.2.2
swh.scheduler >= 0.4.0
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -55,6 +55,7 @@
[swh.cli.subcommands]
loader=swh.loader.cli
[swh.workers]
+ loader.arch=swh.loader.package.arch:register
loader.archive=swh.loader.package.archive:register
loader.cran=swh.loader.package.cran:register
loader.crates=swh.loader.package.crates:register
diff --git a/swh/loader/package/arch/__init__.py b/swh/loader/package/arch/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/arch/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+    """Describe this worker module: its Celery task module and its loader class."""
+    # Local import: the loader module is only loaded when register() is called.
+    from .loader import ArchLoader
+
+    tasks_module = f"{__name__}.tasks"
+    return {"task_modules": [tasks_module], "loader": ArchLoader}
diff --git a/swh/loader/package/arch/loader.py b/swh/loader/package/arch/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/arch/loader.py
@@ -0,0 +1,136 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from distutils.version import LooseVersion
+from pathlib import Path
+import re
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
+
+import attr
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.loader.package.utils import release_name
+from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone
+from swh.storage.interface import StorageInterface
+
+
+@attr.s
+class ArchPackageInfo(BasePackageInfo):
+ """Package information for a single Arch Linux artifact (name, version, date)."""
+
+ name = attr.ib(type=str)
+ """Name of the package"""
+
+ version = attr.ib(type=str)
+ """Current version"""
+
+ last_modified = attr.ib(type=str)
+ """File last modified date as release date"""
+
+
+def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]:
+    """Extract intrinsic metadata from the .PKGINFO file at dir_path.
+
+    Each Arch Linux package has a .PKGINFO file at the root of the archive,
+    made of ``key = value`` lines. Lines that do not have this shape (such as
+    ``#`` comment lines) are ignored.
+
+    Args:
+        dir_path: A directory on disk where a package has been extracted
+
+    Returns:
+        A dict mapping each lower-cased .PKGINFO key to its value. When a key
+        is repeated (e.g. ``depend``), only the last value is kept. If a
+        ``url`` key is present, its value is also exposed as ``project_url``.
+
+    Raises:
+        FileNotFoundError: if dir_path contains no .PKGINFO file.
+    """
+    pkginfo_path = Path(dir_path, ".PKGINFO")
+    # Only horizontal whitespace is allowed around "=": "\s" would also match a
+    # newline and let an entry with an empty value swallow the following line.
+    rex = re.compile(r"^(\w+)[ \t]=[ \t]?(.*)$", re.M)
+    parsed = rex.findall(pkginfo_path.read_text(encoding="utf-8"))
+    data = {key.lower(): value for key, value in parsed}
+    if "url" in data:
+        data["project_url"] = data["url"]
+    return data
+
+
+class ArchLoader(PackageLoader[ArchPackageInfo]):
+    """Package loader for the ``arch`` visit type.
+
+    The artifacts given at construction time are indexed by their ``version``.
+    """
+
+    visit_type = "arch"
+
+    def __init__(
+        self,
+        storage: StorageInterface,
+        url: str,
+        artifacts: List[Dict[str, Any]],
+        **kwargs,
+    ):
+        super().__init__(storage=storage, url=url, **kwargs)
+        self.url = url
+        # Index artifacts by version; a later artifact with the same version
+        # replaces an earlier one.
+        by_version: Dict[str, Dict] = {}
+        for entry in artifacts:
+            by_version[entry["version"]] = entry
+        self.artifacts: Dict[str, Dict] = by_version
+
+    def get_versions(self) -> Sequence[str]:
+        """List every known version of the package, ordered with LooseVersion.
+
+        Returns:
+            A sequence of versions
+
+        Example::
+
+            ["0.1.1", "0.10.2"]
+        """
+        return sorted(self.artifacts, key=LooseVersion)
+
+    def get_default_version(self) -> str:
+        """Return the last entry of get_versions(), i.e. the newest version.
+
+        Returns:
+            A string representing a version
+
+        Example::
+
+            "0.1.2"
+        """
+        ordered = self.get_versions()
+        return ordered[-1]
+
+    def get_package_info(self, version: str) -> Iterator[Tuple[str, ArchPackageInfo]]:
+        """Yield the release name and the package information of a version.
+
+        Args:
+            version: arch version (e.g: "0.1.0")
+
+        Returns:
+            Iterator of tuple (release_name, p_info)
+        """
+        entry = self.artifacts[version]
+        assert version == entry["version"]
+
+        package = ArchPackageInfo(
+            name=entry["name"],
+            filename=entry["filename"],
+            url=entry["url"],
+            version=version,
+            last_modified=entry["last_modified"],
+        )
+        branch = release_name(version, entry["filename"])
+        yield branch, package
+
+    def build_release(
+        self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git
+    ) -> Optional[Release]:
+        """Build the synthetic release describing one extracted package.
+
+        The author and the description are read from the ``packager`` and
+        ``pkgdesc`` entries of the package's .PKGINFO file; the release date is
+        parsed from ``p_info.last_modified``.
+        """
+        metadata = extract_intrinsic_metadata(Path(uncompressed_path))
+        packager = metadata["packager"]
+        author = Person.from_fullname(packager.encode())
+        summary = metadata["pkgdesc"]
+
+        text = (
+            f"Synthetic release for Arch Linux source package {p_info.name} "
+            f"version {p_info.version}\n\n"
+            f"{summary}\n"
+        )
+        return Release(
+            name=p_info.version.encode(),
+            author=author,
+            date=TimestampWithTimezone.from_iso8601(p_info.last_modified),
+            message=text.encode(),
+            target_type=ObjectType.DIRECTORY,
+            target=directory,
+            synthetic=True,
+        )
diff --git a/swh/loader/package/arch/tasks.py b/swh/loader/package/arch/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/arch/tasks.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.loader.package.arch.loader import ArchLoader
+
+
+@shared_task(name=__name__ + ".LoadArch")
+def load_arch(*, url=None, artifacts: list):
+    """Celery task: build an ArchLoader from the config file and run it."""
+    loader = ArchLoader.from_configfile(url=url, artifacts=artifacts)
+    return loader.load()
diff --git a/swh/loader/package/arch/tests/__init__.py b/swh/loader/package/arch/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/package/arch/tests/data/fake_arch.sh b/swh/loader/package/arch/tests/data/fake_arch.sh
new file mode 100755
--- /dev/null
+++ b/swh/loader/package/arch/tests/data/fake_arch.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+# Script to generate fake Arch Linux package files and fake HTTP responses.
+#
+# Output directories (https_*) are created relative to the current directory;
+# a temporary tmp_dir/ is used as scratch space and removed at the end.
+
+set -euo pipefail
+
+# The Zstandard compression tool (zstd) is needed to build the .zst archive and
+# Xz utils (xz) to build the .xz archives: abort early when one is missing
+# instead of failing halfway through with partial output.
+for tool in zstd xz; do
+    command -v "$tool" || { echo "you should install '$tool' to run this script" >&2; exit 1; }
+done
+
+# files and directories
+# (-p: the output directories may already exist, e.g. when regenerating data)
+mkdir -p https_archive.archlinux.org
+mkdir -p https_uk.mirror.archlinuxarm.org
+mkdir -p tmp_dir/arch/
+
+cd tmp_dir/arch/
+
+mkdir -p 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64'
+mkdir -p 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64'
+
+echo -e '''pkgname = dialog
+pkgbase = dialog
+pkgver = 1:1.3_20190211-1
+pkgdesc = A tool to display dialog boxes from shell scripts
+url = https://invisible-island.net/dialog/
+builddate = 1550046926
+packager = Evangelos Foutras <evangelos@foutrelis.com>
+size = 455680
+arch = x86_64
+license = LGPL2.1
+provides = libdialog.so=15-64
+depend = sh
+''' > packages_d_dialog_dialog-1:1.3_20190211-1-x86_64/.PKGINFO
+
+echo -e '''pkgname = dialog
+pkgbase = dialog
+pkgver = 1:1.3_20220414-1
+pkgdesc = A tool to display dialog boxes from shell scripts
+url = https://invisible-island.net/dialog/
+builddate = 1650081535
+packager = Evangelos Foutras <foutrelis@archlinux.org>
+size = 483988
+arch = x86_64
+license = LGPL2.1
+provides = libdialog.so=15-64
+depend = sh
+depend = ncurses
+''' > packages_d_dialog_dialog-1:1.3_20220414-1-x86_64/.PKGINFO
+
+# Compress packages folders to .pkg.tar.xz / .pkg.tar.zst archives
+# (tar -a selects the compressor from the archive suffix)
+
+tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz' -C 'packages_d_dialog_dialog-1:1.3_20190211-1-x86_64' .
+tar --force-local -acf 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst' -C 'packages_d_dialog_dialog-1:1.3_20220414-1-x86_64' .
+
+mv *.xz ../../https_archive.archlinux.org
+mv *.zst ../../https_archive.archlinux.org
+
+# uk.mirror.archlinuxarm.org
+mkdir -p 'aarch64_core_gzip-1.12-1-aarch64'
+
+echo -e '''# Generated by makepkg 6.0.1
+# using fakeroot version 1.28
+pkgname = gzip
+pkgbase = gzip
+pkgver = 1.12-1
+pkgdesc = GNU compression utility
+url = https://www.gnu.org/software/gzip/
+builddate = 1649365694
+packager = Arch Linux ARM Build System <builder+seattle@archlinuxarm.org>
+size = 162688
+arch = aarch64
+license = GPL3
+group = base-devel
+depend = glibc
+depend = bash
+depend = less
+''' > aarch64_core_gzip-1.12-1-aarch64/.PKGINFO
+
+tar --force-local -acf 'aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz' -C 'aarch64_core_gzip-1.12-1-aarch64' .
+
+mv *.xz ../../https_uk.mirror.archlinuxarm.org
+
+# Clean up removing tmp_dir
+cd ../../
+rm -r tmp_dir/
diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst b/swh/loader/package/arch/tests/data/https_archive.archlinux.org/packages_d_dialog_dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz b/swh/loader/package/arch/tests/data/https_uk.mirror.archlinuxarm.org/aarch64_core_gzip-1.12-1-aarch64.pkg.tar.xz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/arch/tests/test_arch.py b/swh/loader/package/arch/tests/test_arch.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/arch/tests/test_arch.py
@@ -0,0 +1,231 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import pytest
+
+from swh.loader.package.arch.loader import ArchLoader
+from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
+from swh.model.hashutil import hash_to_bytes
+from swh.model.model import (
+ ObjectType,
+ Person,
+ Release,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ TimestampWithTimezone,
+)
+
+# Loader arguments shared by the tests below: one entry per origin, each with
+# its origin "url" and the list of "artifacts" handed to ArchLoader.
+# Index 0 is the "dialog" origin (two artifacts), index 1 the "gzip" origin (one).
+EXPECTED_PACKAGES = [
+ {
+ "url": "https://archive.archlinux.org/packages/d/dialog/",
+ "artifacts": [
+ {
+ "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950
+ "arch": "x86_64",
+ "repo": "core",
+ "name": "dialog",
+ "version": "1:1.3_20190211-1",
+ "length": 180000,
+ "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz",
+ "last_modified": "2019-02-13T08:36:00",
+ },
+ {
+ "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950
+ "arch": "x86_64",
+ "repo": "core",
+ "name": "dialog",
+ "version": "1:1.3_20220414-1",
+ "length": 198000,
+ "filename": "dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst",
+ "last_modified": "2022-04-16T03:59:00",
+ },
+ ],
+ },
+ {
+ "url": "https://archlinuxarm.org/packages/aarch64/gzip",
+ "artifacts": [
+ {
+ "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950
+ "arch": "aarch64",
+ "name": "gzip",
+ "repo": "core",
+ "length": 79640,
+ "version": "1.12-1",
+ "filename": "gzip-1.12-1-aarch64.pkg.tar.xz",
+ "last_modified": "2022-04-07T21:08:14",
+ }
+ ],
+ },
+]
+
+
+def test_get_versions(swh_storage):
+ """get_versions() lists both dialog versions, oldest first."""
+ loader = ArchLoader(
+ swh_storage,
+ url=EXPECTED_PACKAGES[0]["url"],
+ artifacts=EXPECTED_PACKAGES[0]["artifacts"],
+ )
+
+ assert loader.get_versions() == [
+ "1:1.3_20190211-1",
+ "1:1.3_20220414-1",
+ ]
+
+
+def test_get_default_version(requests_mock_datadir, swh_storage):
+ """get_default_version() returns the most recent of the two dialog versions."""
+ loader = ArchLoader(
+ swh_storage,
+ url=EXPECTED_PACKAGES[0]["url"],
+ artifacts=EXPECTED_PACKAGES[0]["artifacts"],
+ )
+ assert loader.get_default_version() == "1:1.3_20220414-1"
+
+
+def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage):
+ """Load the single-artifact gzip origin and check snapshot, stats, release and visit."""
+ loader = ArchLoader(
+ swh_storage,
+ url=EXPECTED_PACKAGES[1]["url"],
+ artifacts=EXPECTED_PACKAGES[1]["artifacts"],
+ )
+ actual_load_status = loader.load()
+ assert actual_load_status["status"] == "eventful"
+ assert actual_load_status["snapshot_id"] is not None
+
+ expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4"
+ expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc"
+
+ assert expected_snapshot_id == actual_load_status["snapshot_id"]
+
+ expected_snapshot = Snapshot(
+ id=hash_to_bytes(actual_load_status["snapshot_id"]),
+ branches={
+ b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch(
+ target=hash_to_bytes(expected_release_id),
+ target_type=TargetType.RELEASE,
+ ),
+ b"HEAD": SnapshotBranch(
+ target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz",
+ target_type=TargetType.ALIAS,
+ ),
+ },
+ )
+ check_snapshot(expected_snapshot, swh_storage)
+
+ stats = get_stats(swh_storage)
+ assert {
+ "content": 1,
+ "directory": 1,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 1,
+ "revision": 0,
+ "skipped_content": 0,
+ "snapshot": 1,
+ } == stats
+
+ assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release(
+ name=b"1.12-1",
+ message=b"Synthetic release for Arch Linux source package gzip version "
+ b"1.12-1\n\nGNU compression utility\n",
+ target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"),
+ target_type=ObjectType.DIRECTORY,
+ synthetic=True,
+ author=Person.from_fullname(
+ b"Arch Linux ARM Build System <builder+seattle@archlinuxarm.org>"
+ ),
+ date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"),
+ id=hash_to_bytes(expected_release_id),
+ )
+
+ assert_last_visit_matches(
+ swh_storage,
+ url=EXPECTED_PACKAGES[1]["url"],
+ status="full",
+ type="arch",
+ snapshot=expected_snapshot.id,
+ )
+
+
+def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage):
+ """Load the two-artifact dialog origin: one release branch each, HEAD on the newest."""
+
+ loader = ArchLoader(
+ swh_storage,
+ url=EXPECTED_PACKAGES[0]["url"],
+ artifacts=EXPECTED_PACKAGES[0]["artifacts"],
+ )
+ actual_load_status = loader.load()
+ assert actual_load_status["status"] == "eventful"
+ assert actual_load_status["snapshot_id"] is not None
+
+ expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3"
+
+ assert expected_snapshot_id == actual_load_status["snapshot_id"]
+
+ expected_snapshot = Snapshot(
+ id=hash_to_bytes(actual_load_status["snapshot_id"]),
+ branches={
+ b"releases/1:1.3_20190211-1/"
+ b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz": SnapshotBranch(
+ target=hash_to_bytes("37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"),
+ target_type=TargetType.RELEASE,
+ ),
+ b"releases/1:1.3_20220414-1/"
+ b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst": SnapshotBranch(
+ target=hash_to_bytes("020d3f5627df7474f257fd04f1ede4415296e265"),
+ target_type=TargetType.RELEASE,
+ ),
+ b"HEAD": SnapshotBranch(
+ target=b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst",
+ target_type=TargetType.ALIAS,
+ ),
+ },
+ )
+
+ check_snapshot(expected_snapshot, swh_storage)
+
+ stats = get_stats(swh_storage)
+ assert {
+ "content": 2,
+ "directory": 2,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 2,
+ "revision": 0,
+ "skipped_content": 0,
+ "snapshot": 1,
+ } == stats
+
+ assert_last_visit_matches(
+ swh_storage,
+ url=EXPECTED_PACKAGES[0]["url"],
+ status="full",
+ type="arch",
+ snapshot=expected_snapshot.id,
+ )
+
+
+def test_arch_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir):
+ """Load an origin whose only artifact points at https://mirror2.nowhere/."""
+ url = "https://nowhere/packages/42"
+ loader = ArchLoader(
+ swh_storage,
+ url,
+ artifacts=[
+ {
+ "filename": "42-0.0.1.pkg.xz",
+ "url": "https://mirror2.nowhere/pkg/42-0.0.1.pkg.xz",
+ "version": "0.0.1",
+ "arch": "aarch64",
+ "name": "42",
+ "repo": "community",
+ "length": 42,
+ "last_modified": "2022-04-07T21:08:14",
+ },
+ ],
+ )
+ # NOTE(review): AssertionError is an Exception, so this block also passes when
+ # one of the two assertions below fails; and if loader.load() itself raises,
+ # assert_last_visit_matches() is never reached. Neither the returned status
+ # nor the "not_found" visit status is actually enforced here -- TODO confirm
+ # the intended behaviour and move the checks out of pytest.raises.
+ with pytest.raises(Exception):
+ assert loader.load() == {"status": "failed"}
+ assert_last_visit_matches(
+ swh_storage, url, status="not_found", type="arch", snapshot=None
+ )
diff --git a/swh/loader/package/arch/tests/test_tasks.py b/swh/loader/package/arch/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/arch/tests/test_tasks.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_tasks_arch_loader(
+ mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
+):
+ """Sending the LoadArch task calls ArchLoader.load() and returns its result."""
+ # load() is mocked, so no package is actually fetched or ingested.
+ mock_load = mocker.patch("swh.loader.package.arch.loader.ArchLoader.load")
+ mock_load.return_value = {"status": "eventful"}
+
+ res = swh_scheduler_celery_app.send_task(
+ "swh.loader.package.arch.tasks.LoadArch",
+ kwargs=dict(
+ url="some-url/packages/s/some-package",
+ artifacts=[
+ {
+ "version": "0.0.1",
+ "url": "https://somewhere/some-package-0.0.1.pkg.xz",
+ "filename": "some-package-0.0.1.pkg.xz",
+ "arch": "aarch64",
+ "name": "some-package",
+ "repo": "community",
+ "length": 42,
+ "last_modified": "1970-01-01T21:08:14",
+ }
+ ],
+ ),
+ )
+ assert res
+ res.wait()
+ assert res.successful()
+ assert mock_load.called
+ assert res.result == {"status": "eventful"}

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 19, 1:40 PM (19 h, 33 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220090

Event Timeline