diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst index 1ec6bcc..e4135e9 100644 --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -1,151 +1,151 @@ .. _package-loader-specifications: Package loader specifications ============================= Release fields -------------- Here is an overview of the fields (+ internal version name + branch name) used by each package loader, after D6616: .. list-table:: Fields used by each package loader :header-rows: 1 * - Loader - internal version - branch name - name - message - synthetic - author - date - Notes * - arch - ``p_info.version`` - ``release_name(version, filename)`` - =version - Synthetic release for Arch Linux source package {p_info.name} version {p_info.version} {description} - true - from intrinsic metadata - - from extra_loader_arguments['artifacts'] + - from extra_loader_arguments['arch_metadata'] - Intrinsic metadata extracted from .PKGINFO file of the package * - archive - passed as arg - ``release_name(version)`` - =version - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - * - aur - ``p_info.version`` - ``release_name(version, filename)`` - =version - Synthetic release for Aur source package {p_info.name} version {p_info.version} {description} - true - "" - from extra_loader_arguments['aur_metadata'] - Intrinsic metadata extracted from .SRCINFO file of the package * - cran - ``metadata.get("Version", passed as arg)`` - ``release_name(version)`` - =version - standard message - true - ``metadata.get("Maintainer", "")`` - ``metadata.get("Date")`` - metadata is intrinsic * - crates - ``p_info.version`` - ``release_name(version, filename) + "\n\n" + i_metadata.description + "\n"`` - =version - Synthetic release for Crate source package {p_info.name} version {p_info.version} {description} - true - from int metadata - from ext metadata - ``i_metadata`` for intrinsic metadata, ``e_metadata`` for 
extrinsic metadata * - debian - =``version`` - ``release_name(version)`` - =``i_version`` - standard message (using ``i_version``) - true - ``metadata.changelog.person`` - ``metadata.changelog.date`` - metadata is intrinsic. Old revisions have ``dsc`` as type. ``i_version`` is the intrinsic version (e.g. ``0.7.2-3``) while ``version`` contains the debian suite name (e.g. ``stretch/contrib/0.7.2-3``) and is passed as arg * - deposit - HEAD - only HEAD - HEAD - "{client}: Deposit {id} in collection {collection}\n" - true - original author - ``<codemeta: dateCreated>`` from SWORD XML - revisions had parents * - maven-loader - passed as arg - HEAD - ``release_name(version)`` - "Synthetic release for archive at {p_info.url}\n" - true - "" - passed as arg - Only one artefact per url (jar/zip src) * - nixguix - URL - URL - URL - None - true - "" - None - it's the URL of the artifact referenced by the derivation * - npm - ``metadata["version"]`` - ``release_name(version)`` - =version - standard message - true - from int metadata or "" - from ext metadata or None - * - opam - as given by opam - "{opam_package}.{version}" - =version - standard message - true - from metadata - None - "{self.opam_package}.{version}" matches the version names used by opam's backend. metadata is extrinsic * - pypi - ``metadata["version"]`` - ``release_name(version)`` or ``release_name(version, filename)`` - =version - ``metadata['comment_text']`` or standard message - true - from int metadata or "" - from ext metadata or None - metadata is intrinsic using this function:: def release_name(version: str, filename: Optional[str] = None) -> str: if filename: return "releases/%s/%s" % (version, filename) return "releases/%s" % version and "standard message" being:: msg = ( f"Synthetic release for {PACKAGE_MANAGER} source package {name} " f"version {version}\n" ) The ``target_type`` field is always ``dir``, and the target is the id of a directory loaded by unpacking a tarball/zip file/... 
diff --git a/swh/loader/package/arch/loader.py b/swh/loader/package/arch/loader.py index b96d2bf..7ab9fc2 100644 --- a/swh/loader/package/arch/loader.py +++ b/swh/loader/package/arch/loader.py @@ -1,136 +1,141 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.version import LooseVersion from pathlib import Path import re from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple import attr from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import release_name from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @attr.s class ArchPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" version = attr.ib(type=str) """Current version""" last_modified = attr.ib(type=str) """File last modified date as release date""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from .PKGINFO file at dir_path. Each Arch linux package has a .PKGINFO file at the root of the archive. 
Args: dir_path: A directory on disk where a package has been extracted Returns: A dict mapping """ pkginfo_path = Path(dir_path, ".PKGINFO") rex = re.compile(r"^(\w+)\s=\s(.*)$", re.M) with pkginfo_path.open("rb") as content: parsed = rex.findall(content.read().decode()) data = {entry[0].lower(): entry[1] for entry in parsed} if "url" in data.keys(): data["project_url"] = data["url"] return data class ArchLoader(PackageLoader[ArchPackageInfo]): visit_type = "arch" def __init__( self, storage: StorageInterface, url: str, artifacts: List[Dict[str, Any]], + arch_metadata: List[Dict[str, Any]], **kwargs, ): super().__init__(storage=storage, url=url, **kwargs) self.url = url self.artifacts: Dict[str, Dict] = { artifact["version"]: artifact for artifact in artifacts } + self.arch_metadata: Dict[str, Dict] = { + metadata["version"]: metadata for metadata in arch_metadata + } def get_versions(self) -> Sequence[str]: """Get all released versions of an Arch Linux package Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.artifacts.keys()) versions.sort(key=LooseVersion) return versions def get_default_version(self) -> str: """Get the newest release version of an Arch Linux package Returns: A string representing a version Example:: "0.1.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, ArchPackageInfo]]: """Get release name and package information from version Args: version: arch version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ artifact = self.artifacts[version] - assert version == artifact["version"] + metadata = self.arch_metadata[version] + assert version == artifact["version"] == metadata["version"] p_info = ArchPackageInfo( - name=artifact["name"], + name=metadata["name"], filename=artifact["filename"], url=artifact["url"], version=version, - last_modified=artifact["last_modified"], + last_modified=metadata["last_modified"], ) yield release_name(version, 
artifact["filename"]), p_info def build_release( self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) author = Person.from_fullname(intrinsic_metadata["packager"].encode()) description = intrinsic_metadata["pkgdesc"] message = ( f"Synthetic release for Arch Linux source package {p_info.name} " f"version {p_info.version}\n\n" f"{description}\n" ) return Release( name=p_info.version.encode(), author=author, date=TimestampWithTimezone.from_iso8601(p_info.last_modified), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/arch/tests/test_arch.py b/swh/loader/package/arch/tests/test_arch.py index 0f05d08..3180f9d 100644 --- a/swh/loader/package/arch/tests/test_arch.py +++ b/swh/loader/package/arch/tests/test_arch.py @@ -1,231 +1,253 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.loader.package.arch.loader import ArchLoader from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) EXPECTED_PACKAGES = [ { "url": "https://archive.archlinux.org/packages/d/dialog/", "artifacts": [ { "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", # noqa: B950 + "version": "1:1.3_20190211-1", + "length": 180000, + "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", + }, + { + "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 + "version": "1:1.3_20220414-1", + "length": 198000, 
+ "filename": "dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", + }, + ], + "arch_metadata": [ + { "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20190211-1", - "length": 180000, - "filename": "dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz", "last_modified": "2019-02-13T08:36:00", }, { - "url": "https://archive.archlinux.org/packages/d/dialog/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", # noqa: B950 "arch": "x86_64", "repo": "core", "name": "dialog", "version": "1:1.3_20220414-1", - "length": 198000, - "filename": "dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", "last_modified": "2022-04-16T03:59:00", }, ], }, { "url": "https://archlinuxarm.org/packages/aarch64/gzip", "artifacts": [ { "url": "https://uk.mirror.archlinuxarm.org/aarch64/core/gzip-1.12-1-aarch64.pkg.tar.xz", # noqa: B950 + "length": 79640, + "version": "1.12-1", + "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", + } + ], + "arch_metadata": [ + { "arch": "aarch64", "name": "gzip", "repo": "core", - "length": 79640, "version": "1.12-1", - "filename": "gzip-1.12-1-aarch64.pkg.tar.xz", "last_modified": "2022-04-07T21:08:14", } ], }, ] def test_get_versions(swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[0]["url"], artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], ) assert loader.get_versions() == [ "1:1.3_20190211-1", "1:1.3_20220414-1", ] def test_get_default_version(requests_mock_datadir, swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[0]["url"], artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], ) assert loader.get_default_version() == "1:1.3_20220414-1" def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[1]["url"], artifacts=EXPECTED_PACKAGES[1]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[1]["arch_metadata"], ) actual_load_status = loader.load() assert 
actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4" expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch( target=hash_to_bytes(expected_release_id), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 1, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release( name=b"1.12-1", message=b"Synthetic release for Arch Linux source package gzip version " b"1.12-1\n\nGNU compression utility\n", target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname( b"Arch Linux ARM Build System <builder+seattle@archlinuxarm.org>" ), date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"), id=hash_to_bytes(expected_release_id), ) assert_last_visit_matches( swh_storage, url=EXPECTED_PACKAGES[1]["url"], status="full", type="arch", snapshot=expected_snapshot.id, ) def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[0]["url"], artifacts=EXPECTED_PACKAGES[0]["artifacts"], + arch_metadata=EXPECTED_PACKAGES[0]["arch_metadata"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_snapshot_id = 
"832139d69a91edffcc3a96cca11deaf9255041c3" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/1:1.3_20190211-1/" b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz": SnapshotBranch( target=hash_to_bytes("37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"), target_type=TargetType.RELEASE, ), b"releases/1:1.3_20220414-1/" b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst": SnapshotBranch( target=hash_to_bytes("020d3f5627df7474f257fd04f1ede4415296e265"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 2, "directory": 2, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches( swh_storage, url=EXPECTED_PACKAGES[0]["url"], status="full", type="arch", snapshot=expected_snapshot.id, ) def test_arch_invalid_origin_archive_not_found(swh_storage, requests_mock_datadir): url = "https://nowhere/packages/42" loader = ArchLoader( swh_storage, url, artifacts=[ { "filename": "42-0.0.1.pkg.xz", "url": "https://mirror2.nowhere/pkg/42-0.0.1.pkg.xz", + "version": "0.0.1", + "length": 42, + }, + ], + arch_metadata=[ + { "version": "0.0.1", "arch": "aarch64", "name": "42", "repo": "community", - "length": 42, "last_modified": "2022-04-07T21:08:14", }, ], ) with pytest.raises(Exception): assert loader.load() == {"status": "failed"} assert_last_visit_matches( swh_storage, url, status="not_found", type="arch", snapshot=None ) diff --git a/swh/loader/package/arch/tests/test_tasks.py b/swh/loader/package/arch/tests/test_tasks.py index 15d7ba3..b5178ac 100644 --- a/swh/loader/package/arch/tests/test_tasks.py +++ b/swh/loader/package/arch/tests/test_tasks.py @@ -1,35 +1,40 @@ # Copyright (C) 2022 
The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def test_tasks_arch_loader( mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config ): mock_load = mocker.patch("swh.loader.package.arch.loader.ArchLoader.load") mock_load.return_value = {"status": "eventful"} res = swh_scheduler_celery_app.send_task( "swh.loader.package.arch.tasks.LoadArch", kwargs=dict( url="some-url/packages/s/some-package", artifacts=[ { "version": "0.0.1", "url": "https://somewhere/some-package-0.0.1.pkg.xz", "filename": "some-package-0.0.1.pkg.xz", + "length": 42, + } + ], + arch_metadata=[ + { + "version": "0.0.1", "arch": "aarch64", "name": "some-package", "repo": "community", - "length": 42, "last_modified": "1970-01-01T21:08:14", } ], ), ) assert res res.wait() assert res.successful() assert mock_load.called assert res.result == {"status": "eventful"}