cpan loader: add ExtID manifest fields to CpanPackageInfo

CpanPackageInfo gains EXTID_TYPE ("cpan-manifest-sha256"), EXTID_VERSION (0),
a MANIFEST_FORMAT template over name, version, last_modified and sha256sum,
and a new sha256sum attribute. CpanLoader.get_package_info fills sha256sum
from artifact["checksums"]["sha256"].

The load test additionally collects package_info.extid() for every version,
looks those ExtIDs up in storage, and asserts that their targets are exactly
the release SWHIDs referenced by the expected snapshot.

diff --git a/swh/loader/package/cpan/loader.py b/swh/loader/package/cpan/loader.py
index cfbcec9..282fe53 100644
--- a/swh/loader/package/cpan/loader.py
+++ b/swh/loader/package/cpan/loader.py
@@ -1,180 +1,192 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import datetime
 import logging
+import string
 from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
 
 import attr
 import iso8601
 from packaging.version import parse as parse_version
 from requests import HTTPError
 
 from swh.loader.package.loader import (
     BasePackageInfo,
     PackageLoader,
     RawExtrinsicMetadataCore,
 )
 from swh.loader.package.utils import EMPTY_AUTHOR, Person, get_url_body, release_name
 from swh.model.model import (
     MetadataAuthority,
     MetadataAuthorityType,
     ObjectType,
     Release,
     Sha1Git,
     TimestampWithTimezone,
 )
 from swh.storage.interface import StorageInterface
 
 logger = logging.getLogger(__name__)
 
 
 @attr.s
 class CpanPackageInfo(BasePackageInfo):
+    EXTID_TYPE = "cpan-manifest-sha256"
+    EXTID_VERSION = 0
+
+    MANIFEST_FORMAT = string.Template(
+        "name $name\nversion $version\ndate $last_modified\nshasum $sha256sum"
+    )
+
     name = attr.ib(type=str)
     """Name of the package"""
 
     version = attr.ib(type=str)
     """Current version"""
 
     last_modified = attr.ib(type=datetime)
     """File last modified date as release date."""
 
     author = attr.ib(type=Person)
     """Author"""
 
+    sha256sum = attr.ib(type=str)
+    """sha256 checksum of package tarball"""
+
 
 class CpanLoader(PackageLoader[CpanPackageInfo]):
     visit_type = "cpan"
 
     EXTRINSIC_METADATA_URL_PATTERN = "{api_base_url}/release/{author}/{release_name}"
 
     def __init__(
         self,
         storage: StorageInterface,
         url: str,
         api_base_url: str,
         artifacts: List[Dict[str, Any]],
         module_metadata: List[Dict[str, Any]],
         **kwargs,
     ):
 
         super().__init__(storage=storage, url=url, **kwargs)
 
         self.url = url
         self.api_base_url = api_base_url
         self.artifacts: Dict[str, Dict] = {
             artifact["version"]: {k: v for k, v in artifact.items() if k != "version"}
             for artifact in artifacts
         }
         self.module_metadata: Dict[str, Dict] = {
             meta["version"]: meta for meta in module_metadata
         }
 
     def get_metadata_authority(self):
         return MetadataAuthority(
             type=MetadataAuthorityType.FORGE,
             url="https://metacpan.org/",
         )
 
     def get_versions(self) -> Sequence[str]:
         """Get all released versions of a Perl package
 
         Returns:
             A sequence of versions
 
             Example::
 
                 ["0.1.1", "0.10.2"]
         """
         versions = list(self.artifacts.keys())
         versions.sort(key=parse_version)
         return versions
 
     def get_default_version(self) -> str:
         """Get the newest release version of a Perl package
 
         Returns:
             A string representing a version
 
             Example::
 
                 "0.10.2"
         """
         return self.get_versions()[-1]
 
     def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]:
         """Get release name and package information from version
 
         Args:
             version: Package version (e.g: "0.1.0")
 
         Returns:
             Iterator of tuple (release_name, p_info)
         """
         artifact = self.artifacts[version]
         metadata = self.module_metadata[version]
 
         last_modified = iso8601.parse_date(metadata["date"])
         author = (
             Person.from_fullname(metadata["author"].encode())
             if metadata["author"]
             else EMPTY_AUTHOR
         )
 
         try:
             extrinsic_metadata_url = self.EXTRINSIC_METADATA_URL_PATTERN.format(
                 api_base_url=self.api_base_url,
                 author=metadata["cpan_author"],
                 release_name=metadata["release_name"],
             )
             version_extrinsic_metadata = get_url_body(extrinsic_metadata_url)
         except HTTPError:
             logger.warning(
                 "Could not fetch extrinsic_metadata for module %s version %s",
                 metadata["name"],
                 version,
             )
             version_extrinsic_metadata = None
 
         directory_extrinsic_metadata = []
 
         if version_extrinsic_metadata:
             directory_extrinsic_metadata.append(
                 RawExtrinsicMetadataCore(
                     format="cpan-release-json",
                     metadata=version_extrinsic_metadata,
                 )
             )
 
         p_info = CpanPackageInfo(
             name=metadata["name"],
             filename=artifact["filename"],
             url=artifact["url"],
             version=version,
             last_modified=last_modified,
             author=author,
             checksums=artifact["checksums"],
             directory_extrinsic_metadata=directory_extrinsic_metadata,
+            sha256sum=artifact["checksums"]["sha256"],
         )
         yield release_name(version), p_info
 
     def build_release(
         self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git
     ) -> Optional[Release]:
         message = (
             f"Synthetic release for Perl source package {p_info.name} "
             f"version {p_info.version}\n"
         )
 
         return Release(
             name=p_info.version.encode(),
             author=p_info.author,
             date=TimestampWithTimezone.from_datetime(p_info.last_modified),
             message=message.encode(),
             target_type=ObjectType.DIRECTORY,
             target=directory,
             synthetic=True,
         )
diff --git a/swh/loader/package/cpan/tests/test_cpan.py b/swh/loader/package/cpan/tests/test_cpan.py
index f865372..92a7f87 100644
--- a/swh/loader/package/cpan/tests/test_cpan.py
+++ b/swh/loader/package/cpan/tests/test_cpan.py
@@ -1,209 +1,229 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 # flake8: noqa: B950
 
 import json
 from pathlib import Path
 
 import pytest
 
 from swh.loader.package import __version__
-from swh.loader.package.cpan.loader import CpanLoader
+from swh.loader.package.cpan.loader import CpanLoader, CpanPackageInfo
 from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats
 from swh.model.hashutil import hash_to_bytes
 from swh.model.model import (
     Person,
     RawExtrinsicMetadata,
     Release,
     Snapshot,
     SnapshotBranch,
     TargetType,
     TimestampWithTimezone,
 )
 from swh.model.model import MetadataFetcher
 from swh.model.model import ObjectType as ModelObjectType
 from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID, ObjectType
 
 ORIGIN_URL = "https://metacpan.org/dist/Internals-CountObjects"
 
 API_BASE_URL = "https://fastapi.metacpan.org/v1"
 
 ORIGIN_ARTIFACTS = [
     {
         "url": "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.05.tar.gz",
         "filename": "CountObjects-0.05.tar.gz",
         "version": "0.05",
         "length": 632,
         "checksums": {
             "sha256": "e0ecf6ab4873fa55ff74da22a3c4ae0ab6a1409635c9cd2d6059abbb32be3a6a"
         },
     },
     {
         "url": "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.01.tar.gz",
         "filename": "CountObjects-0.01.tar.gz",
         "version": "0.01",
         "length": 453,
         "checksums": {
             "sha256": "a368004ab98c5860a8fd87e0a4c44e4ee2d1b95d9b13597519a0e644c167468a"
         },
     },
 ]
 
 ORIGIN_MODULE_METADATA = [
     {
         "name": "Internals-CountObjects",
         "version": "0.05",
         "author": "Josh Jore <jjore@cpan.org>",
         "cpan_author": "JJORE",
         "date": "2011-06-11T05:23:31",
         "release_name": "Internals-CountObjects-0.05",
     },
     {
         "name": "Internals-CountObjects",
         "version": "0.01",
         "author": "Josh Jore <jjore@cpan.org>",
         "cpan_author": "JJORE",
         "date": "2011-06-05T18:44:02",
         "release_name": "Internals-CountObjects-0.01",
     },
 ]
 
 
 @pytest.fixture
 def head_release_original_artifacts_metadata():
     return json.dumps(
         [{k: v for k, v in ORIGIN_ARTIFACTS[0].items() if k != "version"}]
     ).encode()
 
 
 @pytest.fixture
 def head_release_extrinsic_metadata(datadir):
     return Path(
         datadir,
         "https_fastapi.metacpan.org",
         "v1_release_JJORE_Internals-CountObjects-0.05",
     ).read_bytes()
 
 
 @pytest.fixture
 def cpan_loader(requests_mock_datadir, swh_storage):
     return CpanLoader(
         swh_storage,
         url=ORIGIN_URL,
         api_base_url=API_BASE_URL,
         artifacts=ORIGIN_ARTIFACTS,
         module_metadata=ORIGIN_MODULE_METADATA,
     )
 
 
 def test_get_versions(cpan_loader):
     assert cpan_loader.get_versions() == ["0.01", "0.05"]
 
 
 def test_get_default_version(cpan_loader):
     assert cpan_loader.get_default_version() == "0.05"
 
 
 def test_cpan_loader_load_multiple_version(
     cpan_loader,
     head_release_original_artifacts_metadata,
     head_release_extrinsic_metadata,
 ):
     load_status = cpan_loader.load()
     assert load_status["status"] == "eventful"
     assert load_status["snapshot_id"] is not None
 
     expected_snapshot_id = "848ee8d69d33481c88ab81f6794f6504190f011f"
     expected_head_release = "07382fd255ec0fc293b92aeb7e68b3fe31c174f9"
 
     assert expected_snapshot_id == load_status["snapshot_id"]
 
     expected_snapshot = Snapshot(
         id=hash_to_bytes(load_status["snapshot_id"]),
         branches={
             b"releases/0.01": SnapshotBranch(
                 target=hash_to_bytes("e73aced4cc3d56b32a328d3248b25b052f029df4"),
                 target_type=TargetType.RELEASE,
             ),
             b"releases/0.05": SnapshotBranch(
                 target=hash_to_bytes(expected_head_release),
                 target_type=TargetType.RELEASE,
             ),
             b"HEAD": SnapshotBranch(
                 target=b"releases/0.05",
                 target_type=TargetType.ALIAS,
             ),
         },
     )
 
     storage = cpan_loader.storage
 
     check_snapshot(expected_snapshot, storage)
 
     stats = get_stats(storage)
     assert {
         "content": 2,
         "directory": 4,
         "origin": 1,
         "origin_visit": 1,
         "release": 2,
         "revision": 0,
         "skipped_content": 0,
         "snapshot": 1,
     } == stats
 
     head_release = storage.release_get([hash_to_bytes(expected_head_release)])[0]
 
     assert head_release == Release(
         name=b"0.05",
         message=b"Synthetic release for Perl source package Internals-CountObjects version 0.05\n",
         target=hash_to_bytes("af3f6a43eaf4b26dbcadb1101e8d81db6d6151e0"),
         target_type=ModelObjectType.DIRECTORY,
         synthetic=True,
         author=Person(
             fullname=b"Josh Jore <jjore@cpan.org>",
             name=b"Josh Jore",
             email=b"jjore@cpan.org",
         ),
         date=TimestampWithTimezone.from_iso8601("2011-06-11T05:23:31+00:00"),
         id=hash_to_bytes(expected_head_release),
     )
 
     assert_last_visit_matches(
         storage,
         url=ORIGIN_URL,
         status="full",
         type="cpan",
         snapshot=expected_snapshot.id,
     )
 
     release_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=head_release.id)
     directory_swhid = ExtendedSWHID(
         object_type=ExtendedObjectType.DIRECTORY, object_id=head_release.target
     )
     expected_metadata = [
         RawExtrinsicMetadata(
             target=directory_swhid,
             authority=cpan_loader.get_metadata_authority(),
             fetcher=MetadataFetcher(
                 name="swh.loader.package.cpan.loader.CpanLoader",
                 version=__version__,
             ),
             discovery_date=cpan_loader.visit_date,
             format="cpan-release-json",
             metadata=head_release_extrinsic_metadata,
             origin=ORIGIN_URL,
             release=release_swhid,
         ),
     ]
 
     assert (
         cpan_loader.storage.raw_extrinsic_metadata_get(
             directory_swhid,
             cpan_loader.get_metadata_authority(),
         ).results
         == expected_metadata
     )
+
+    package_extids = [
+        package_info.extid()
+        for version in cpan_loader.get_versions()
+        for _, package_info in cpan_loader.get_package_info(version)
+    ]
+
+    extids = storage.extid_get_from_extid(
+        id_type=CpanPackageInfo.EXTID_TYPE,
+        ids=[extid for (_, _, extid) in package_extids],
+        version=CpanPackageInfo.EXTID_VERSION,
+    )
+
+    release_swhids = {
+        CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target)
+        for branch in expected_snapshot.branches.values()
+        if branch.target_type == TargetType.RELEASE
+    }
+
+    assert {extid.target for extid in extids} == release_swhids