diff --git a/swh/loader/package/cpan/loader.py b/swh/loader/package/cpan/loader.py index 830362e..b9d7fab 100644 --- a/swh/loader/package/cpan/loader.py +++ b/swh/loader/package/cpan/loader.py @@ -1,192 +1,179 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import json +import logging from pathlib import Path -from typing import Any, Dict, Iterator, Optional, Sequence, Tuple +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple import attr import iso8601 from packaging.version import parse as parse_version import yaml from swh.loader.package.loader import BasePackageInfo, PackageLoader -from swh.loader.package.utils import ( - EMPTY_AUTHOR, - Person, - cached_method, - get_url_body, - release_name, -) +from swh.loader.package.utils import EMPTY_AUTHOR, Person, release_name from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone from swh.storage.interface import StorageInterface +logger = logging.getLogger(__name__) + @attr.s class CpanPackageInfo(BasePackageInfo): name = attr.ib(type=str) """Name of the package""" - filename = attr.ib(type=str) - """Archive (tar.gz) file name""" - version = attr.ib(type=str) """Current version""" last_modified = attr.ib(type=datetime) """File last modified date as release date.""" author = attr.ib(type=Person) """Author""" def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: """Extract intrinsic metadata from META.json file at dir_path. - Each Perl package version has a META.json file at the root of the archive, + Most Perl package versions have a META.json file at the root of the archive, or a META.yml for older version. See https://perldoc.perl.org/CPAN::Meta for META specifications. 
Args: dir_path: A directory on disk where a META.json|.yml can be found Returns: A dict mapping from yaml parser """ meta_json_path = dir_path / "META.json" + meta_yml_path = dir_path / "META.yml" metadata: Dict[str, Any] = {} if meta_json_path.exists(): metadata = json.loads(meta_json_path.read_text()) - - meta_yml_path = dir_path / "META.yml" - if meta_yml_path.exists(): + elif meta_yml_path.exists(): metadata = yaml.safe_load(meta_yml_path.read_text()) return metadata class CpanLoader(PackageLoader[CpanPackageInfo]): visit_type = "cpan" def __init__( self, storage: StorageInterface, url: str, + api_base_url: str, + artifacts: List[Dict[str, Any]], + module_metadata: List[Dict[str, Any]], **kwargs, ): super().__init__(storage=storage, url=url, **kwargs) self.url = url - - @cached_method - def info_versions(self) -> Dict: - """Return the package versions (fetched from - ``https://fastapi.metacpan.org/v1/release/versions/{pkgname}``) - - Api documentation https://cpan.haskell.org/api - """ - pkgname = self.url.split("/")[-1] - url = f"https://fastapi.metacpan.org/v1/release/versions/{pkgname}" - data = json.loads(get_url_body(url=url, headers={"Accept": "application/json"})) - return {release["version"]: release for release in data["releases"]} + self.api_base_url = api_base_url + self.artifacts: Dict[str, Dict] = { + artifact["version"]: {k: v for k, v in artifact.items() if k != "version"} + for artifact in artifacts + } + self.module_metadata: Dict[str, Dict] = { + meta["version"]: meta for meta in module_metadata + } def get_versions(self) -> Sequence[str]: """Get all released versions of a Perl package Returns: A sequence of versions Example:: ["0.1.1", "0.10.2"] """ - versions = list(self.info_versions().keys()) + versions = list(self.artifacts.keys()) versions.sort(key=parse_version) return versions def get_default_version(self) -> str: """Get the newest release version of a Perl package Returns: A string representing a version Example:: "0.10.2" """ return 
self.get_versions()[-1] def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]: """Get release name and package information from version Args: version: Package version (e.g: "0.1.0") Returns: Iterator of tuple (release_name, p_info) """ - data = self.info_versions()[version] - pkgname: str = self.url.split("/")[-1] - url: str = data["download_url"] - filename: str = url.split("/")[-1] - # The api does not provide an explicit timezone, defaults to UTC - last_modified = iso8601.parse_date(data["date"]) - - if "author" in data: - author = Person.from_fullname(data["author"].encode()) - else: - author = EMPTY_AUTHOR + artifact = self.artifacts[version] + metadata = self.module_metadata[version] + + last_modified = iso8601.parse_date(metadata["date"]) + author = ( + Person.from_fullname(metadata["author"].encode()) + if metadata["author"] + else EMPTY_AUTHOR + ) p_info = CpanPackageInfo( - name=pkgname, - filename=filename, - url=url, + name=metadata["name"], + filename=artifact["filename"], + url=artifact["url"], version=version, last_modified=last_modified, author=author, + checksums=artifact["checksums"], ) yield release_name(version), p_info def build_release( self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git ) -> Optional[Release]: # Extract intrinsic metadata from uncompressed_path/META.json|.yml intrinsic_metadata = extract_intrinsic_metadata( Path(uncompressed_path) / f"{p_info.name}-{p_info.version}" ) - name: str = intrinsic_metadata["name"] - assert name == p_info.name - version: str = str(intrinsic_metadata["version"]) - assert version == p_info.version - # author data from http endpoint are less complete than from META if "author" in intrinsic_metadata: author_data = intrinsic_metadata["author"] if type(author_data) is list: author = author_data[0] else: author = author_data author = Person.from_fullname(author.encode()) else: author = p_info.author message = ( - f"Synthetic release for Perl source package 
{name} version {version}\n" + f"Synthetic release for Perl source package {p_info.name} " + f"version {p_info.version}\n" ) return Release( - name=version.encode(), + name=p_info.version.encode(), author=author, date=TimestampWithTimezone.from_datetime(p_info.last_modified), message=message.encode(), target_type=ObjectType.DIRECTORY, target=directory, synthetic=True, ) diff --git a/swh/loader/package/cpan/tests/data/fake_cpan.sh b/swh/loader/package/cpan/tests/data/fake_cpan.sh deleted file mode 100644 index 5a23d47..0000000 --- a/swh/loader/package/cpan/tests/data/fake_cpan.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash - -# Script to generate fake Perl package archives as .tar.gz. - -set -euo pipefail - -# Create directories -readonly TMP=tmp_dir/cpan -readonly BASE_PATH=https_cpan.metacpan.org - -mkdir -p $TMP - -# tar.gz package archives -# Perl package tar.gz archive needs at least one directory with a META.json or META.yml file -mkdir -p ${TMP}/Internals-CountObjects-0.01 -mkdir -p ${TMP}/Internals-CountObjects-0.05 -mkdir -p $BASE_PATH - -echo -e """--- -abstract: 'Report all allocated perl objects' -author: - - 'Josh Jore ' -build_requires: {} -configure_requires: - ExtUtils::MakeMaker: 6.31 -dynamic_config: 0 -generated_by: 'Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400' -license: perl -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: 1.4 -name: Internals-CountObjects -version: 0.01 -""" > ${TMP}/Internals-CountObjects-0.01/META.yml - -echo -e '''{ - "abstract" : "Report all allocated perl objects", - "author" : [ - "Josh Jore " - ], - "dynamic_config" : 0, - "generated_by" : "Dist::Zilla version 4.200000, CPAN::Meta::Converter version 2.102400", - "license" : [ - "perl_5" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "Internals-CountObjects", - "prereqs" : { - "build" : { - "requires" : { - "ExtUtils::CBuilder" : 0 - } 
- } - }, - "release_status" : "stable", - "resources" : { - "bugtracker" : { - "mailto" : "bug-Internals-CountObjects@rt.cpan.org", - "web" : "http://rt.cpan.org/NoAuth/Bugs.html?Dist=Internals-CountObjects" - }, - "homepage" : "http://search.cpan.org/dist/Internals-CountObjects", - "repository" : { - "type" : "git", - "url" : "git://github.com/jbenjore/Internals-CountObjects.git", - "web" : "http://github.com/jbenjore/Internals-CountObjects" - } - }, - "version" : "0.05" -} -''' > ${TMP}/Internals-CountObjects-0.05/META.json - -cd $TMP - -# Tar compress -tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.01.tar.gz Internals-CountObjects-0.01 -tar -czf authors_id_J_JJ_JJORE_Internals-CountObjects-0.05.tar.gz Internals-CountObjects-0.05 - -# Move .tar.gz archives to a servable directory -mv *.tar.gz ../../$BASE_PATH - -# Clean up removing tmp_dir -cd ../../ -rm -r tmp_dir/ diff --git a/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects b/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects deleted file mode 100644 index df9422b..0000000 --- a/swh/loader/package/cpan/tests/data/https_fastapi.metacpan.org/v1_release_versions_Internals-CountObjects +++ /dev/null @@ -1,26 +0,0 @@ -{ - "took" : 3, - "releases" : [ - { - "maturity" : "released", - "authorized" : true, - "date" : "2011-06-11T05:23:31", - "name" : "Internals-CountObjects-0.05", - "version" : "0.05", - "author" : "JJORE", - "download_url" : "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.05.tar.gz", - "status" : "latest" - }, - { - "authorized" : true, - "date" : "2011-06-05T18:44:02", - "maturity" : "released", - "name" : "Internals-CountObjects-0.01", - "version" : "0.01", - "author" : "JJORE", - "download_url" : "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.01.tar.gz", - "status" : "cpan" - } - ], - "total" : 4 -} diff --git 
a/swh/loader/package/cpan/tests/test_cpan.py b/swh/loader/package/cpan/tests/test_cpan.py index 7904fe7..afd809f 100644 --- a/swh/loader/package/cpan/tests/test_cpan.py +++ b/swh/loader/package/cpan/tests/test_cpan.py @@ -1,109 +1,156 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +# flake8: noqa: B950 + +import pytest + +from swh.loader.package import __version__ from swh.loader.package.cpan.loader import CpanLoader from swh.loader.tests import assert_last_visit_matches, check_snapshot, get_stats from swh.model.hashutil import hash_to_bytes from swh.model.model import ( - ObjectType, Person, Release, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) +from swh.model.model import ObjectType as ModelObjectType + +ORIGIN_URL = "https://metacpan.org/dist/Internals-CountObjects" -ORIGINS = [ - "https://metacpan.org/dist/Internals-CountObjects", +API_BASE_URL = "https://fastapi.metacpan.org/v1" + +ORIGIN_ARTIFACTS = [ + { + "url": "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.05.tar.gz", + "filename": "CountObjects-0.05.tar.gz", + "version": "0.05", + "length": 632, + "checksums": { + "sha256": "e0ecf6ab4873fa55ff74da22a3c4ae0ab6a1409635c9cd2d6059abbb32be3a6a" + }, + }, + { + "url": "https://cpan.metacpan.org/authors/id/J/JJ/JJORE/Internals-CountObjects-0.01.tar.gz", + "filename": "CountObjects-0.01.tar.gz", + "version": "0.01", + "length": 453, + "checksums": { + "sha256": "a368004ab98c5860a8fd87e0a4c44e4ee2d1b95d9b13597519a0e644c167468a" + }, + }, +] + +ORIGIN_MODULE_METADATA = [ + { + "name": "Internals-CountObjects", + "version": "0.05", + "author": "Josh Jore ", + "cpan_author": "JJORE", + "date": "2011-06-11T05:23:31", + "release_name": "Internals-CountObjects-0.05", + }, + { + "name": "Internals-CountObjects", + "version": 
"0.01", + "author": "Josh Jore ", + "cpan_author": "JJORE", + "date": "2011-06-05T18:44:02", + "release_name": "Internals-CountObjects-0.01", + }, ] -def test_get_versions(requests_mock_datadir, swh_storage): - loader = CpanLoader( +@pytest.fixture +def cpan_loader(requests_mock_datadir, swh_storage): + return CpanLoader( swh_storage, - url=ORIGINS[0], + url=ORIGIN_URL, + api_base_url=API_BASE_URL, + artifacts=ORIGIN_ARTIFACTS, + module_metadata=ORIGIN_MODULE_METADATA, ) - assert loader.get_versions() == ["0.01", "0.05"] -def test_get_default_version(requests_mock_datadir, swh_storage): - loader = CpanLoader( - swh_storage, - url=ORIGINS[0], - ) - assert loader.get_default_version() == "0.05" +def test_get_versions(cpan_loader): + assert cpan_loader.get_versions() == ["0.01", "0.05"] -def test_cpan_loader_load_multiple_version(datadir, requests_mock_datadir, swh_storage): - loader = CpanLoader( - swh_storage, - url=ORIGINS[0], - ) - load_status = loader.load() +def test_get_default_version(cpan_loader): + assert cpan_loader.get_default_version() == "0.05" + + +def test_cpan_loader_load_multiple_version(cpan_loader): + + load_status = cpan_loader.load() assert load_status["status"] == "eventful" assert load_status["snapshot_id"] is not None expected_snapshot_id = "848ee8d69d33481c88ab81f6794f6504190f011f" + expected_head_release = "07382fd255ec0fc293b92aeb7e68b3fe31c174f9" assert expected_snapshot_id == load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(load_status["snapshot_id"]), branches={ b"releases/0.01": SnapshotBranch( target=hash_to_bytes("e73aced4cc3d56b32a328d3248b25b052f029df4"), target_type=TargetType.RELEASE, ), b"releases/0.05": SnapshotBranch( - target=hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9"), + target=hash_to_bytes(expected_head_release), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/0.05", target_type=TargetType.ALIAS, ), }, ) - check_snapshot(expected_snapshot, swh_storage) + 
storage = cpan_loader.storage + + check_snapshot(expected_snapshot, storage) - stats = get_stats(swh_storage) + stats = get_stats(storage) assert { "content": 2, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats - assert swh_storage.release_get( - [hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9")] - )[0] == Release( + head_release = storage.release_get([hash_to_bytes(expected_head_release)])[0] + + assert head_release == Release( name=b"0.05", - message=b"Synthetic release for Perl source package Internals-CountObjects" - b" version 0.05\n", + message=b"Synthetic release for Perl source package Internals-CountObjects version 0.05\n", target=hash_to_bytes("af3f6a43eaf4b26dbcadb1101e8d81db6d6151e0"), - target_type=ObjectType.DIRECTORY, + target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person( fullname=b"Josh Jore ", name=b"Josh Jore", email=b"jjore@cpan.org", ), date=TimestampWithTimezone.from_iso8601("2011-06-11T05:23:31+00:00"), - id=hash_to_bytes("07382fd255ec0fc293b92aeb7e68b3fe31c174f9"), + id=hash_to_bytes(expected_head_release), ) assert_last_visit_matches( - swh_storage, - url=ORIGINS[0], + storage, + url=ORIGIN_URL, status="full", type="cpan", snapshot=expected_snapshot.id, ) diff --git a/swh/loader/package/cpan/tests/test_tasks.py b/swh/loader/package/cpan/tests/test_tasks.py index dc8cf1e..55cfa31 100644 --- a/swh/loader/package/cpan/tests/test_tasks.py +++ b/swh/loader/package/cpan/tests/test_tasks.py @@ -1,40 +1,52 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import uuid import pytest from swh.scheduler.model import ListedOrigin, Lister +from .test_cpan import ( + API_BASE_URL, + ORIGIN_ARTIFACTS, + ORIGIN_MODULE_METADATA, + ORIGIN_URL, +) + NAMESPACE = 
"swh.loader.package.cpan" @pytest.fixture def cpan_lister(): return Lister(name="cpan", instance_name="example", id=uuid.uuid4()) @pytest.fixture def cpan_listed_origin(cpan_lister): return ListedOrigin( lister_id=cpan_lister.id, - url="https://metacpan.org/dist/Software-Packager", + url=ORIGIN_URL, visit_type="cpan", + extra_loader_arguments={ + "api_base_url": API_BASE_URL, + "artifacts": ORIGIN_ARTIFACTS, + "module_metadata": ORIGIN_MODULE_METADATA, + }, ) def test_cpan_loader_task_for_listed_origin( loading_task_creation_for_listed_origin_test, cpan_lister, cpan_listed_origin, ): loading_task_creation_for_listed_origin_test( loader_class_name=f"{NAMESPACE}.loader.CpanLoader", task_function_name=f"{NAMESPACE}.tasks.LoadCpan", lister=cpan_lister, listed_origin=cpan_listed_origin, )