diff --git a/swh/loader/package/cpan/loader.py b/swh/loader/package/cpan/loader.py index 00300fe..830362e 100644 --- a/swh/loader/package/cpan/loader.py +++ b/swh/loader/package/cpan/loader.py @@ -1,196 +1,192 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import json from pathlib import Path from typing import Any, Dict, Iterator, Optional, Sequence, Tuple import attr import iso8601 from packaging.version import parse as parse_version import yaml from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import ( EMPTY_AUTHOR, Person, cached_method, get_url_body, release_name, ) from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface @attr.s class CpanPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" filename = attr.ib(type=str) """Archive (tar.gz) file name""" version = attr.ib(type=str) """Current version""" last_modified = attr.ib(type=datetime) """File last modified date as release date.""" author = attr.ib(type=Person) """Author""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from META.json file at dir_path. Each Perl package version has a META.json file at the root of the archive, or a META.yml for older version. See https://perldoc.perl.org/CPAN::Meta for META specifications. Args: dir_path: A directory on disk where a META.json|.yml can be found Returns: A dict mapping from yaml parser """ meta_json_path = dir_path / "META.json" metadata: Dict[str, Any] = {} if meta_json_path.exists(): metadata = json.loads(meta_json_path.read_text()) meta_yml_path = dir_path / "META.yml" if meta_yml_path.exists(): metadata = yaml.safe_load(meta_yml_path.read_text()) return metadata class CpanLoader(PackageLoader[CpanPackageInfo]): visit_type = "cpan" def __init__( self, storage: StorageInterface, url: str, **kwargs, ): super().__init__(storage=storage, url=url, **kwargs) self.url = url @cached_method def info_versions(self) -> Dict: """Return the package versions (fetched from ``https://fastapi.metacpan.org/v1/release/versions/{pkgname}``) Api documentation https://cpan.haskell.org/api """ pkgname = self.url.split("/")[-1] url = f"https://fastapi.metacpan.org/v1/release/versions/{pkgname}" data = json.loads(get_url_body(url=url, headers={"Accept": "application/json"})) return {release["version"]: release for release in data["releases"]} def get_versions(self) -> Sequence[str]: """Get all released versions of a Perl package Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ versions = list(self.info_versions().keys()) versions.sort(key=parse_version) return versions def get_default_version(self) -> str: """Get the newest release version of a Perl package Returns: A string representing a version Example:: "0.10.2" """ return self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]: """Get release name and package information from version Args: version: Package version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ data = self.info_versions()[version] pkgname: str = self.url.split("/")[-1] url: str = data["download_url"] filename: str = url.split("/")[-1] # The api does not provide an explicit timezone, defaults to UTC last_modified = iso8601.parse_date(data["date"]) if "author" in data: author = Person.from_fullname(data["author"].encode()) else: author = EMPTY_AUTHOR p_info = CpanPackageInfo( name=pkgname, filename=filename, url=url, version=version, last_modified=last_modified, author=author, ) yield release_name(version), p_info def build_release( self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from uncompressed_path/META.json|.yml intrinsic_metadata = extract_intrinsic_metadata( Path(uncompressed_path) / f"{p_info.name}-{p_info.version}" ) name: str = intrinsic_metadata["name"] assert name == p_info.name version: str = str(intrinsic_metadata["version"]) assert version == p_info.version - description = intrinsic_metadata["abstract"] - # author data from http endpoint are less complete than from META if "author" in intrinsic_metadata: author_data = intrinsic_metadata["author"] if type(author_data) is list: author = author_data[0] else: author = author_data author = Person.from_fullname(author.encode()) else: author = p_info.author message = ( - f"Synthetic release for Perl source package {name} " - f"version {version}\n\n" - f"{description}\n" + f"Synthetic release for Perl source package {name} version {version}\n" ) return Release( name=version.encode(), author=author, date=TimestampWithTimezone.from_datetime(p_info.last_modified), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/cpan/tests/test_cpan.py b/swh/loader/package/cpan/tests/test_cpan.py index 4b6c284..7904fe7 100644 --- a/swh/loader/package/cpan/tests/test_cpan.py +++ b/swh/loader/package/cpan/tests/test_cpan.py @@ -1,109 +1,109 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.package.cpan.loader import CpanLoader from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) ORIGINS = [ "https://metacpan.org/dist/Internals-CountObjects", ] def test_get_versions(requests_mock_datadir, swh_storage): loader = CpanLoader( swh_storage, url=ORIGINS[0], ) assert loader.get_versions() == ["0.01", "0.05"] def test_get_default_version(requests_mock_datadir, swh_storage): loader = CpanLoader( swh_storage, url=ORIGINS[0], ) assert loader.get_default_version() == "0.05" def test_cpan_loader_load_multiple_version(datadir, requests_mock_datadir, swh_storage): loader = CpanLoader( swh_storage, url=ORIGINS[0], ) load_status = loader.load() assert load_status["status"] == "eventful" assert load_status["snapshot_id"] is not None - expected_snapshot_id = "2b1f606033ef5ccfed78aeb94baf5a8b901b2306" + expected_snapshot_id = "848ee8d69d33481c88ab81f6794f6504190f011f" assert expected_snapshot_id == load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(load_status["snapshot_id"]), branches={ b"releases/0.01": SnapshotBranch( - target=hash_to_bytes("3b31ce005c364de6c1b8caca8bf12487d5debf38"), + target=hash_to_bytes("e73aced4cc3d56b32a328d3248b25b052f029df4"), target_type=TargetType.RELEASE, ), b"releases/0.05": SnapshotBranch( - target=hash_to_bytes("2901106d99de31f71380b6c3b5e92799ce3a1a5e"), + target=hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.05", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 2, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert swh_storage.release_get( - [hash_to_bytes("2901106d99de31f71380b6c3b5e92799ce3a1a5e")] + [hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9")] )[0] == Release( name=b"0.05", message=b"Synthetic release for Perl source package Internals-CountObjects" - b" version 0.05\n\nReport all allocated perl objects\n", + b" version 0.05\n", target=hash_to_bytes("af3f6a43eaf4b26dbcadb1101e8d81db6d6151e0"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person( fullname=b"Josh Jore <jjore@cpan.org>", name=b"Josh Jore", email=b"jjore@cpan.org", ), date=TimestampWithTimezone.from_iso8601("2011-06-11T05:23:31+00:00"), - id=hash_to_bytes("2901106d99de31f71380b6c3b5e92799ce3a1a5e"), + id=hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9"), ) assert_last_visit_matches( swh_storage, url=ORIGINS[0], status="full", type="cpan", snapshot=expected_snapshot.id, )