Page MenuHomeSoftware Heritage

D8557.diff
No OneTemporary

D8557.diff

diff --git a/docs/package-loader-specifications.rst b/docs/package-loader-specifications.rst
--- a/docs/package-loader-specifications.rst
+++ b/docs/package-loader-specifications.rst
@@ -47,6 +47,15 @@
- ""
- from extra_loader_arguments['aur_metadata']
- Intrinsic metadata extracted from .SRCINFO file of the package
+ * - cpan
+ - ``p_info.​version``
+ - ``release_name(​version)``
+ - =version
+ - Synthetic release for Perl source package {name} version {version} {description}
+ - true
+ - from intrinsic metadata if any else from extrinsic
+ - from extrinsic metadata
+ - name, version and description from intrinsic metadata
* - cran
- ``metadata.get(​"Version", passed as arg)``
- ``release_name(​version)``
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
loader.arch=swh.loader.package.arch:register
loader.archive=swh.loader.package.archive:register
loader.aur=swh.loader.package.aur:register
+ loader.cpan=swh.loader.package.cpan:register
loader.cran=swh.loader.package.cran:register
loader.crates=swh.loader.package.crates:register
loader.debian=swh.loader.package.debian:register
diff --git a/swh/loader/package/cpan/__init__.py b/swh/loader/package/cpan/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from typing import Any, Mapping
+
+
+def register() -> Mapping[str, Any]:
+    """Describe this worker module: its task modules and its loader class"""
+    # Deferred import: the loader module is only loaded when register() runs
+    from .loader import CpanLoader
+
+    task_modules = [f"{__name__}.tasks"]
+    return dict(task_modules=task_modules, loader=CpanLoader)
diff --git a/swh/loader/package/cpan/loader.py b/swh/loader/package/cpan/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/loader.py
@@ -0,0 +1,196 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+import json
+from pathlib import Path
+from typing import Any, Dict, Iterator, Optional, Sequence, Tuple
+
+import attr
+import iso8601
+from packaging.version import parse as parse_version
+import yaml
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.loader.package.utils import (
+ EMPTY_AUTHOR,
+ Person,
+ cached_method,
+ get_url_body,
+ release_name,
+)
+from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone
+from swh.storage.interface import StorageInterface
+
+
+@attr.s
+class CpanPackageInfo(BasePackageInfo):
+ """Information about one released version of a Perl package.
+
+ One instance is built per version by ``CpanLoader.get_package_info`` from
+ the release entry returned by the api.
+ """
+
+ # NOTE(review): ``url`` is passed at construction time but not declared
+ # here; presumably it is inherited from BasePackageInfo - confirm.
+
+ name = attr.ib(type=str)
+ """Name of the package"""
+
+ filename = attr.ib(type=str)
+ """Archive (tar.gz) file name"""
+
+ version = attr.ib(type=str)
+ """Current version"""
+
+ last_modified = attr.ib(type=datetime)
+ """File last modified date as release date."""
+
+ author = attr.ib(type=Person)
+ """Author"""
+
+
+def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]:
+    """Extract intrinsic metadata from the META file found at dir_path.
+
+    Each Perl package version has a META.json file at the root of the archive,
+    or a META.yml for older versions. When both files are present META.json
+    wins: META.yml is only read as a fallback.
+
+    See https://perldoc.perl.org/CPAN::Meta for META specifications.
+
+    Args:
+        dir_path: A directory on disk where a META.json|.yml can be found
+
+    Returns:
+        The parsed content of META.json if it exists, else the parsed content
+        of META.yml if it exists, else an empty dict
+    """
+    meta_json_path = dir_path / "META.json"
+    if meta_json_path.exists():
+        return json.loads(meta_json_path.read_text())
+
+    meta_yml_path = dir_path / "META.yml"
+    if meta_yml_path.exists():
+        # safe_load returns None for an empty document; always hand back a dict
+        return yaml.safe_load(meta_yml_path.read_text()) or {}
+
+    return {}
+
+
+class CpanLoader(PackageLoader[CpanPackageInfo]):
+    """Load every released version of a Perl package, as listed by the MetaCPAN api"""
+
+    visit_type = "cpan"
+
+    def __init__(
+        self,
+        storage: StorageInterface,
+        url: str,
+        **kwargs,
+    ):
+        """
+        Args:
+            storage: Storage instance handed over to ``PackageLoader``
+            url: Origin url; its last path segment is used as the package name
+                (e.g: "https://metacpan.org/dist/Internals-CountObjects")
+            **kwargs: Forwarded untouched to ``PackageLoader``
+        """
+        super().__init__(storage=storage, url=url, **kwargs)
+        self.url = url
+
+    def _package_name(self) -> str:
+        """Return the package name, that is the last path segment of the origin url"""
+        # Trailing slashes are ignored so that ".../dist/Foo/" still yields "Foo"
+        return self.url.rstrip("/").split("/")[-1]
+
+    @cached_method
+    def info_versions(self) -> Dict:
+        """Return the package versions (fetched from
+        ``https://fastapi.metacpan.org/v1/release/versions/{pkgname}``)
+
+        Api documentation
+        https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md
+
+        Returns:
+            A dict mapping each version string to its release entry as returned
+            by the api
+        """
+        pkgname = self._package_name()
+        url = f"https://fastapi.metacpan.org/v1/release/versions/{pkgname}"
+        data = json.loads(get_url_body(url=url, headers={"Accept": "application/json"}))
+        return {release["version"]: release for release in data["releases"]}
+
+    def get_versions(self) -> Sequence[str]:
+        """Get all released versions of a Perl package
+
+        Returns:
+            A sequence of versions, sorted in ascending order
+
+        Example::
+
+            ["0.1.1", "0.10.2"]
+        """
+        # NOTE(review): parse_version comes from ``packaging``; Perl version
+        # strings it cannot parse may need dedicated handling - confirm against
+        # real CPAN data.
+        return sorted(self.info_versions(), key=parse_version)
+
+    def get_default_version(self) -> str:
+        """Get the newest release version of a Perl package
+
+        Returns:
+            A string representing a version
+
+        Example::
+
+            "0.10.2"
+        """
+        return self.get_versions()[-1]
+
+    def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]:
+        """Get release name and package information from version
+
+        Args:
+            version: Package version (e.g: "0.1.0")
+
+        Returns:
+            Iterator of tuple (release_name, p_info)
+        """
+        data = self.info_versions()[version]
+        url: str = data["download_url"]
+        filename: str = url.split("/")[-1]
+        # The api does not provide an explicit timezone, defaults to UTC
+        last_modified = iso8601.parse_date(data["date"])
+
+        if "author" in data:
+            author = Person.from_fullname(data["author"].encode())
+        else:
+            author = EMPTY_AUTHOR
+
+        p_info = CpanPackageInfo(
+            name=self._package_name(),
+            filename=filename,
+            url=url,
+            version=version,
+            last_modified=last_modified,
+            author=author,
+        )
+        yield release_name(version), p_info
+
+    def build_release(
+        self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git
+    ) -> Optional[Release]:
+        """Build the synthetic release of one package version
+
+        Args:
+            p_info: Package information of the version being loaded
+            uncompressed_path: Directory where the archive has been extracted;
+                the META file is looked up in its ``{name}-{version}`` folder
+            directory: Identifier of the directory the release targets
+
+        Returns:
+            A synthetic release named after ``p_info.version`` and dated from
+            ``p_info.last_modified``. The author and the description come from
+            the META file when it provides them; otherwise the author from
+            ``p_info`` is used and the description is left out of the message.
+        """
+        # Extract intrinsic metadata from uncompressed_path/META.json|.yml
+        intrinsic_metadata = extract_intrinsic_metadata(
+            Path(uncompressed_path) / f"{p_info.name}-{p_info.version}"
+        )
+
+        # Name and version are taken from p_info rather than from the META
+        # file: whenever the META values are usable they are equal to these,
+        # while a missing META file or a YAML version parsed as a number
+        # (1.10 read as 1.1) must not abort the load of the release.
+        name: str = p_info.name
+        version: str = p_info.version
+
+        # author data from http endpoint are less complete than from META
+        author_data = intrinsic_metadata.get("author")
+        if isinstance(author_data, list):
+            # Only the first listed author is kept
+            author_data = author_data[0] if author_data else None
+        if isinstance(author_data, str) and author_data:
+            author = Person.from_fullname(author_data.encode())
+        else:
+            author = p_info.author
+
+        message = f"Synthetic release for Perl source package {name} version {version}\n"
+        description = intrinsic_metadata.get("abstract")
+        if description:
+            message += f"\n{description}\n"
+
+        return Release(
+            name=version.encode(),
+            author=author,
+            date=TimestampWithTimezone.from_datetime(p_info.last_modified),
+            message=message.encode(),
+            target_type=ObjectType.DIRECTORY,
+            target=directory,
+            synthetic=True,
+        )
diff --git a/swh/loader/package/cpan/tasks.py b/swh/loader/package/cpan/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/tasks.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.loader.package.cpan.loader import CpanLoader
+
+
+@shared_task(name=__name__ + ".LoadCpan")
+def load_cpan(**kwargs):
+    """Build a Cpan (The Comprehensive Perl Archive Network) loader from the
+    configuration file and return the result of its ``load()`` call"""
+    loader = CpanLoader.from_configfile(**kwargs)
+    return loader.load()
diff --git a/swh/loader/package/cpan/tests/__init__.py b/swh/loader/package/cpan/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/package/cpan/tests/data/fake_cpan.sh b/swh/loader/package/cpan/tests/data/fake_cpan.sh
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/tests/data/fake_cpan.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+# Script to generate fake Perl package archives as .tar.gz.
+#
+# All paths are relative to the current working directory: run it from the
+# directory that must receive the https_cpan.metacpan.org folder.
+
+# Stop at the first failing command, unset variable or failing pipeline stage
+set -euo pipefail
+
+# Create directories
+readonly TMP=tmp_dir/cpan
+readonly BASE_PATH=https_cpan.metacpan.org
+
+mkdir -p $TMP
+
+# tar.gz package archives
+# Perl package tar.gz archive needs at least one directory with a META.json or META.yml file
+mkdir -p ${TMP}/Internals-CountObjects-0.01
+mkdir -p ${TMP}/Internals-CountObjects-0.05
+mkdir -p $BASE_PATH
+
+# Version 0.01 only ships a META.yml (meta-spec 1.4)
+echo -e """---
+abstract: 'Report all allocated perl objects'
+author:
+ - 'Josh Jore <jjore@cpan.org>'
+build_requires: {}
+configure_requires:
+ ExtUtils::MakeMaker: 6.31
+dynamic_config: 0
+generated_by: 'Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400'
+license: perl
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: 1.4
+name: Internals-CountObjects
+version: 0.01
+""" > ${TMP}/Internals-CountObjects-0.01/META.yml
+
+# Version 0.05 only ships a META.json (meta-spec 2)
+echo -e '''{
+ "abstract" : "Report all allocated perl objects",
+ "author" : [
+ "Josh Jore <jjore@cpan.org>"
+ ],
+ "dynamic_config" : 0,
+ "generated_by" : "Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400",
+ "license" : [
+ "perl_5"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : "2"
+ },
+ "name" : "Internals-CountObjects",
+ "prereqs" : {
+ "build" : {
+ "requires" : {
+ "ExtUtils::CBuilder" : 0
+ }
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "bugtracker" : {
+ "mailto" : "bug-Internals-CountObjects@rt.cpan.org",
+ "web" : "http://rt.cpan.org/NoAuth/Bugs.html?Dist=Internals-CountObjects"
+ },
+ "homepage" : "http://search.cpan.org/dist/Internals-CountObjects",
+ "repository" : {
+ "type" : "git",
+ "url" : "git://github.com/jbenjore/Internals-CountObjects.git",
+ "web" : "http://github.com/jbenjore/Internals-CountObjects"
+ }
+ },
+ "version" : "0.05"
+}
+''' > ${TMP}/Internals-CountObjects-0.05/META.json
+
+cd $TMP
+
+# Tar compress
+# Archive names are the download url paths with '/' replaced by '_'
+tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz Internals-CountObjects-0.01
+tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.05.tar.gz Internals-CountObjects-0.05
+
+# Move .tar.gz archives to a servable directory
+mv *.tar.gz ../../$BASE_PATH
+
+# Clean up removing tmp_dir
+cd ../../
+rm -r tmp_dir/
diff --git a/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz b/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.05.tar.gz b/swh/loader/package/cpan/tests/data/https_cpan.metacpan.org/authors_id_J_JJ_JJORE_Internals-CountObjects-0.05.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects b/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects
@@ -0,0 +1,26 @@
+{
+ "took" : 3,
+ "releases" : [
+ {
+ "maturity" : "released",
+ "authorized" : true,
+ "date" : "2011-06-11T05:23:31",
+ "name" : "Internals-CountObjects-0.05",
+ "version" : "0.05",
+ "author" : "JJORE",
+ "download_url" : "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.05.tar.gz",
+ "status" : "latest"
+ },
+ {
+ "authorized" : true,
+ "date" : "2011-06-05T18:44:02",
+ "maturity" : "released",
+ "name" : "Internals-CountObjects-0.01",
+ "version" : "0.01",
+ "author" : "JJORE",
+ "download_url" : "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.01.tar.gz",
+ "status" : "cpan"
+ }
+ ],
+ "total" : 4
+}
diff --git a/swh/loader/package/cpan/tests/test_cpan.py b/swh/loader/package/cpan/tests/test_cpan.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/tests/test_cpan.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.loader.package.cpan.loader import CpanLoader
+from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
+from swh.model.hashutil import hash_to_bytes
+from swh.model.model import (
+ ObjectType,
+ Person,
+ Release,
+ Snapshot,
+ SnapshotBranch,
+ TargetType,
+ TimestampWithTimezone,
+)
+
+# Origin urls used by the tests; the loader takes the package name from the
+# last path segment of the url
+ORIGINS = [
+ "https://metacpan.org/dist/Internals-CountObjects",
+]
+
+
+def test_get_versions(requests_mock_datadir, swh_storage):
+    """Versions served by the mocked api come back sorted in ascending order"""
+    origin_url = ORIGINS[0]
+    cpan_loader = CpanLoader(swh_storage, url=origin_url)
+    expected_versions = ["0.01", "0.05"]
+    assert cpan_loader.get_versions() == expected_versions
+
+
+def test_get_default_version(requests_mock_datadir, swh_storage):
+    """The default version is the highest of the versions served by the mocked api"""
+    origin_url = ORIGINS[0]
+    cpan_loader = CpanLoader(swh_storage, url=origin_url)
+    default_version = cpan_loader.get_default_version()
+    assert default_version == "0.05"
+
+
+def test_cpan_loader_load_multiple_version(datadir, requests_mock_datadir, swh_storage):
+    """Load both fixture versions, then check the snapshot, the storage
+    counters, the release stored for version 0.05 and the recorded visit"""
+    origin_url = ORIGINS[0]
+    release_0_01_id = hash_to_bytes("3b31ce005c364de6c1b8caca8bf12487d5debf38")
+    release_0_05_id = hash_to_bytes("2901106d99de31f71380b6c3b5e92799ce3a1a5e")
+
+    result = CpanLoader(swh_storage, url=origin_url).load()
+
+    assert result["status"] == "eventful"
+    assert result["snapshot_id"] is not None
+    assert result["snapshot_id"] == "2b1f606033ef5ccfed78aeb94baf5a8b901b2306"
+
+    # One branch per release, plus HEAD aliased to the newest one
+    branches = {
+        b"releases/0.01": SnapshotBranch(
+            target=release_0_01_id,
+            target_type=TargetType.RELEASE,
+        ),
+        b"releases/0.05": SnapshotBranch(
+            target=release_0_05_id,
+            target_type=TargetType.RELEASE,
+        ),
+        b"HEAD": SnapshotBranch(
+            target=b"releases/0.05",
+            target_type=TargetType.ALIAS,
+        ),
+    }
+    expected_snapshot = Snapshot(
+        id=hash_to_bytes(result["snapshot_id"]),
+        branches=branches,
+    )
+    check_snapshot(expected_snapshot, swh_storage)
+
+    expected_stats = {
+        "content": 2,
+        "directory": 4,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 2,
+        "revision": 0,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert get_stats(swh_storage) == expected_stats
+
+    expected_release = Release(
+        name=b"0.05",
+        message=b"Synthetic release for Perl source package Internals-CountObjects"
+        b" version 0.05\n\nReport all allocated perl objects\n",
+        target=hash_to_bytes("af3f6a43eaf4b26dbcadb1101e8d81db6d6151e0"),
+        target_type=ObjectType.DIRECTORY,
+        synthetic=True,
+        author=Person(
+            fullname=b"Josh Jore <jjore@cpan.org>",
+            name=b"Josh Jore",
+            email=b"jjore@cpan.org",
+        ),
+        date=TimestampWithTimezone.from_iso8601("2011-06-11T05:23:31+00:00"),
+        id=release_0_05_id,
+    )
+    stored_release = swh_storage.release_get([release_0_05_id])[0]
+    assert stored_release == expected_release
+
+    assert_last_visit_matches(
+        swh_storage,
+        url=origin_url,
+        status="full",
+        type="cpan",
+        snapshot=expected_snapshot.id,
+    )
diff --git a/swh/loader/package/cpan/tests/test_tasks.py b/swh/loader/package/cpan/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/cpan/tests/test_tasks.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def test_tasks_cpan_loader(
+    mocker, swh_scheduler_celery_app, swh_scheduler_celery_worker, swh_config
+):
+    """The LoadCpan task calls ``CpanLoader.load`` and returns its result"""
+    expected_result = {"status": "eventful"}
+    mock_load = mocker.patch(
+        "swh.loader.package.cpan.loader.CpanLoader.load",
+        return_value=expected_result,
+    )
+
+    task_kwargs = {"url": "some-url/api/packages/some-package"}
+    res = swh_scheduler_celery_app.send_task(
+        "swh.loader.package.cpan.tasks.LoadCpan",
+        kwargs=task_kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+    assert mock_load.called
+    assert res.result == {"status": "eventful"}

File Metadata

Mime Type
text/plain
Expires
Wed, Sep 17, 4:51 PM (7 h, 6 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220351

Event Timeline