diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst --- a/docs/package-loader-specifications.rst +++ b/docs/package-loader-specifications.rst @@ -47,6 +47,15 @@ - "" - from extra_loader_arguments['aur_metadata'] - Intrinsic metadata extracted from .SRCINFO file of the package + * - cpan + - ``p_info.​version`` + - ``release_name(​version)`` + - =version + - Synthetic release for Perl source package {name} version {version} {description} + - true + - from intrinsic metadata if any else from extrinsic + - from extrinsic metadata + - name, version and description from intrinsic metadata * - cran - ``metadata.get(​"Version", passed as arg)`` - ``release_name(​version)`` diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ loader.arch=swh.loader.package.arch:register loader.archive=swh.loader.package.archive:register loader.aur=swh.loader.package.aur:register + loader.cpan=swh.loader.package.cpan:register loader.cran=swh.loader.package.cran:register loader.crates=swh.loader.package.crates:register loader.debian=swh.loader.package.debian:register diff --git a/swh/loader/package/cpan/__init__.py b/swh/loader/package/cpan/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cpan/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from typing import Any, Mapping + + +def register() -> Mapping[str, Any]: + """Register the current worker module's definition""" + from .loader import CpanLoader + + return { + "task_modules": [f"{__name__}.tasks"], + "loader": CpanLoader, + } diff --git a/swh/loader/package/cpan/loader.py b/swh/loader/package/cpan/loader.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cpan/loader.py @@ -0,0 +1,196 @@ +# Copyright (C) 
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime
import json
from pathlib import Path
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple

import attr
import iso8601
from packaging.version import InvalidVersion
from packaging.version import parse as parse_version
import yaml

from swh.loader.package.loader import BasePackageInfo, PackageLoader
from swh.loader.package.utils import (
    EMPTY_AUTHOR,
    Person,
    cached_method,
    get_url_body,
    release_name,
)
from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone
from swh.storage.interface import StorageInterface


@attr.s
class CpanPackageInfo(BasePackageInfo):
    """Description of one released archive of a Perl package."""

    name = attr.ib(type=str)
    """Name of the package"""

    filename = attr.ib(type=str)
    """Archive (tar.gz) file name"""

    version = attr.ib(type=str)
    """Current version"""

    last_modified = attr.ib(type=datetime)
    """File last modified date as release date."""

    author = attr.ib(type=Person)
    """Author"""


def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]:
    """Extract intrinsic metadata from the META file found at dir_path.

    Each Perl package version has a META.json file at the root of the archive,
    or a META.yml for older version. When both files are present META.json is
    used: it is the current format, and JSON keeps version numbers such as
    ``"0.10"`` as strings whereas an unquoted YAML scalar is parsed as the
    float ``0.1``.

    See https://perldoc.perl.org/CPAN::Meta for META specifications.

    Args:
        dir_path: A directory on disk where a META.json|.yml can be found

    Returns:
        The parsed META document as a dict. An empty dict is returned when
        no META file exists or when the document is not a mapping.
    """
    meta_json_path = dir_path / "META.json"
    meta_yml_path = dir_path / "META.yml"

    metadata: Any = None
    if meta_json_path.exists():
        metadata = json.loads(meta_json_path.read_text(encoding="utf-8"))
    elif meta_yml_path.exists():
        # safe_load returns None for an empty document
        metadata = yaml.safe_load(meta_yml_path.read_text(encoding="utf-8"))

    return metadata if isinstance(metadata, dict) else {}


def _version_sort_key(version: str) -> Tuple[int, Any]:
    """Return a sort key ordering parseable versions after unparseable ones.

    Recent releases of ``packaging`` raise ``InvalidVersion`` for strings that
    are not PEP 440 compliant. Those versions are kept and sorted first, by
    plain string comparison, so they can never become the default version
    when at least one parseable version exists.
    """
    try:
        return (1, parse_version(version))
    except InvalidVersion:
        return (0, version)


class CpanLoader(PackageLoader[CpanPackageInfo]):
    """Package loader for Perl packages, using the metacpan.org HTTP API."""

    visit_type = "cpan"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        **kwargs,
    ):
        super().__init__(storage=storage, url=url, **kwargs)
        self.url = url

    @cached_method
    def info_versions(self) -> Dict:
        """Return the package versions (fetched from
        ``https://fastapi.metacpan.org/v1/release/versions/{pkgname}``)

        Api documentation
        https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md

        Returns:
            A dict mapping each version string to the release entry returned
            by the API for that version.
        """
        # The package name is the last path component of the origin url
        pkgname = self.url.split("/")[-1]
        url = f"https://fastapi.metacpan.org/v1/release/versions/{pkgname}"
        data = json.loads(get_url_body(url=url, headers={"Accept": "application/json"}))
        return {release["version"]: release for release in data["releases"]}

    def get_versions(self) -> Sequence[str]:
        """Get all released versions of a Perl package

        Returns:
            A sequence of versions, sorted from oldest to newest

        Example::

            ["0.1.1", "0.10.2"]
        """
        return sorted(self.info_versions(), key=_version_sort_key)

    def get_default_version(self) -> str:
        """Get the newest release version of a Perl package

        Returns:
            A string representing a version

        Example::

            "0.10.2"
        """
        return self.get_versions()[-1]

    def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]:
        """Get release name and package information from version

        Args:
            version: Package version (e.g: "0.1.0")

        Returns:
            Iterator of tuple (release_name, p_info)
        """
        data = self.info_versions()[version]
        pkgname: str = self.url.split("/")[-1]
        url: str = data["download_url"]
        filename: str = url.split("/")[-1]
        # The api does not provide an explicit timezone, defaults to UTC
        last_modified = iso8601.parse_date(data["date"])

        if "author" in data:
            author = Person.from_fullname(data["author"].encode())
        else:
            author = EMPTY_AUTHOR

        p_info = CpanPackageInfo(
            name=pkgname,
            filename=filename,
            url=url,
            version=version,
            last_modified=last_modified,
            author=author,
        )
        yield release_name(version), p_info

    def build_release(
        self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        """Build the synthetic release for one uncompressed package archive.

        Name, version, description and author are read from the intrinsic
        metadata (META.json|.yml) when available. Missing name or version
        fall back to the values of ``p_info``, a missing author falls back to
        ``p_info.author`` and a missing description is left out of the
        release message.

        Args:
            p_info: Package information for the release being built
            uncompressed_path: Directory where the archive was uncompressed
            directory: Identifier of the loaded root directory

        Returns:
            The synthetic release targeting ``directory``

        Raises:
            ValueError: if the name or version found in the intrinsic
                metadata does not match the one from ``p_info``
        """
        # Extract intrinsic metadata from uncompressed_path/META.json|.yml
        intrinsic_metadata = extract_intrinsic_metadata(
            Path(uncompressed_path) / f"{p_info.name}-{p_info.version}"
        )

        # Explicit checks instead of ``assert``, which is stripped by ``-O``
        name: str = intrinsic_metadata.get("name", p_info.name)
        if name != p_info.name:
            raise ValueError(
                f"META name {name!r} does not match package name {p_info.name!r}"
            )
        version: str = str(intrinsic_metadata.get("version", p_info.version))
        if version != p_info.version:
            raise ValueError(
                f"META version {version!r} does not match "
                f"package version {p_info.version!r}"
            )

        description = intrinsic_metadata.get("abstract")

        # author data from http endpoint are less complete than from META
        author_data = intrinsic_metadata.get("author")
        if isinstance(author_data, (list, tuple)):
            # Only the first listed author is recorded on the release
            author_data = author_data[0] if author_data else None
        if isinstance(author_data, str):
            author = Person.from_fullname(author_data.encode())
        else:
            author = p_info.author

        message = (
            f"Synthetic release for Perl source package {name} "
            f"version {version}\n"
        )
        if description is not None:
            message += f"\n{description}\n"

        return Release(
            name=version.encode(),
            author=author,
            date=TimestampWithTimezone.from_datetime(p_info.last_modified),
            message=message.encode(),
            target_type=ObjectType.DIRECTORY,
            target=directory,
            synthetic=True,
        )
General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.loader.package.cpan.loader import CpanLoader + + +@shared_task(name=__name__ + ".LoadCpan") +def load_cpan(**kwargs): + """Load packages from Cpan (The Comprehensive Perl Archive Network)""" + return CpanLoader.from_configfile(**kwargs).load() diff --git a/swh/loader/package/cpan/tests/__init__.py b/swh/loader/package/cpan/tests/__init__.py new file mode 100644 diff --git a/swh/loader/package/cpan/tests/data/fake_cpan.sh b/swh/loader/package/cpan/tests/data/fake_cpan.sh new file mode 100644 --- /dev/null +++ b/swh/loader/package/cpan/tests/data/fake_cpan.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +# Script to generate fake Perl package archives as .tar.gz. + +set -euo pipefail + +# Create directories +readonly TMP=tmp_dir/cpan +readonly BASE_PATH=https_cpan.metacpan.org + +mkdir -p $TMP + +# tar.gz package archives +# Perl package tar.gz archive needs at least one directory with a META.json or META.yml file +mkdir -p ${TMP}/Internals-CountObjects-0.01 +mkdir -p ${TMP}/Internals-CountObjects-0.05 +mkdir -p $BASE_PATH + +echo -e """--- +abstract: 'Report all allocated perl objects' +author: + - 'Josh Jore ' +build_requires: {} +configure_requires: + ExtUtils::MakeMaker: 6.31 +dynamic_config: 0 +generated_by: 'Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: 1.4 +name: Internals-CountObjects +version: 0.01 +""" > ${TMP}/Internals-CountObjects-0.01/META.yml + +echo -e '''{ + "abstract" : "Report all allocated perl objects", + "author" : [ + "Josh Jore " + ], + "dynamic_config" : 0, + "generated_by" : "Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + 
}, + "name" : "Internals-CountObjects", + "prereqs" : { + "build" : { + "requires" : { + "ExtUtils::CBuilder" : 0 + } + } + }, + "release_status" : "stable", + "resources" : { + "bugtracker" : { + "mailto" : "bug-Internals-CountObjects@rt.cpan.org", + "web" : "http://rt.cpan.org/NoAuth/Bugs.html?Dist=Internals-CountObjects" + }, + "homepage" : "http://search.cpan.org/dist/Internals-CountObjects", + "repository" : { + "type" : "git", + "url" : "git://github.com/jbenjore/Internals-CountObjects.git", + "web" : "http://github.com/jbenjore/Internals-CountObjects" + } + }, + "version" : "0.05" +} +''' > ${TMP}/Internals-CountObjects-0.05/META.json + +cd $TMP + +# Tar compress +tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz Internals-CountObjects-0.01 +tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.05.tar.gz Internals-CountObjects-0.05 + +# Move .tar.gz archives to a servable directory +mv *.tar.gz ../../$BASE_PATH + +# Clean up removing tmp_dir +cd ../../ +rm -r tmp_dir/ diff --git a/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz b/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@", + name=b"Josh Jore", + email=b"jjore@cpan.org", + ), + date=TimestampWithTimezone.from_iso8601("2011-06-11T05:23:31+00:00"), + id=hash_to_bytes("2901106d99de31f71380b6c3b5e92799ce3a1a5e"), + ) + + assert_last_visit_matches( + swh_storage, + url=ORIGINS[0], + status="full", + type="cpan", + snapshot=expected_snapshot.id, + ) diff --git a/swh/loader/package/cpan/tests/test_tasks.py b/swh/loader/package/cpan/tests/test_tasks.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/cpan/tests/test_tasks.py @@ -0,0 +1,23 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the 
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


def test_tasks_cpan_loader(
    mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
):
    """Check that the LoadCpan celery task runs CpanLoader.load and returns
    its result."""
    # Replace the actual loading with a stub returning a canned status
    patched_load = mocker.patch(
        "swh.loader.package.cpan.loader.CpanLoader.load",
        return_value={"status": "eventful"},
    )

    task_kwargs = {"url": "some-url/api/packages/some-package"}
    async_result = swh_scheduler_celery_app.send_task(
        "swh.loader.package.cpan.tasks.LoadCpan", kwargs=task_kwargs
    )
    assert async_result

    # Block until the worker has processed the task
    async_result.wait()

    assert async_result.successful()
    assert patched_load.called
    assert async_result.result == {"status": "eventful"}