Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/cpan/loader.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | |||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | |||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | |||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | |||||||||||
from datetime import datetime | from datetime import datetime | |||||||||||
import json | import json | |||||||||||
import logging | ||||||||||||
from pathlib import Path | from pathlib import Path | |||||||||||
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple | from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple | |||||||||||
import attr | import attr | |||||||||||
import iso8601 | import iso8601 | |||||||||||
from packaging.version import parse as parse_version | from packaging.version import parse as parse_version | |||||||||||
import yaml | import yaml | |||||||||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BasePackageInfo, PackageLoader | |||||||||||
from swh.loader.package.utils import ( | from swh.loader.package.utils import EMPTY_AUTHOR, Person, release_name | |||||||||||
EMPTY_AUTHOR, | ||||||||||||
Person, | ||||||||||||
cached_method, | ||||||||||||
get_url_body, | ||||||||||||
release_name, | ||||||||||||
) | ||||||||||||
from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone | from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone | |||||||||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | |||||||||||
logger = logging.getLogger(__name__) | ||||||||||||
@attr.s | @attr.s | |||||||||||
class CpanPackageInfo(BasePackageInfo): | class CpanPackageInfo(BasePackageInfo): | |||||||||||
name = attr.ib(type=str) | name = attr.ib(type=str) | |||||||||||
"""Name of the package""" | """Name of the package""" | |||||||||||
filename = attr.ib(type=str) | ||||||||||||
"""Archive (tar.gz) file name""" | ||||||||||||
version = attr.ib(type=str) | version = attr.ib(type=str) | |||||||||||
"""Current version""" | """Current version""" | |||||||||||
last_modified = attr.ib(type=datetime) | last_modified = attr.ib(type=datetime) | |||||||||||
"""File last modified date as release date.""" | """File last modified date as release date.""" | |||||||||||
author = attr.ib(type=Person) | author = attr.ib(type=Person) | |||||||||||
"""Author""" | """Author""" | |||||||||||
def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | |||||||||||
"""Extract intrinsic metadata from META.json file at dir_path. | """Extract intrinsic metadata from META.json file at dir_path. | |||||||||||
Each Perl package version has a META.json file at the root of the archive, | Most Perl package versions have a META.json file at the root of the archive, | |||||||||||
or a META.yml for older version. | or a META.yml for older version. | |||||||||||
vlorentz (Unsubmitted, Not Done) — inline comment: [comment body not captured in this extract]
See https://perldoc.perl.org/CPAN::Meta for META specifications. | See https://perldoc.perl.org/CPAN::Meta for META specifications. | |||||||||||
Args: | Args: | |||||||||||
dir_path: A directory on disk where a META.json|.yml can be found | dir_path: A directory on disk where a META.json|.yml can be found | |||||||||||
Returns: | Returns: | |||||||||||
A dict mapping from yaml parser | A dict mapping from yaml parser | |||||||||||
""" | """ | |||||||||||
meta_json_path = dir_path / "META.json" | meta_json_path = dir_path / "META.json" | |||||||||||
meta_yml_path = dir_path / "META.yml" | ||||||||||||
metadata: Dict[str, Any] = {} | metadata: Dict[str, Any] = {} | |||||||||||
if meta_json_path.exists(): | if meta_json_path.exists(): | |||||||||||
metadata = json.loads(meta_json_path.read_text()) | metadata = json.loads(meta_json_path.read_text()) | |||||||||||
elif meta_yml_path.exists(): | ||||||||||||
meta_yml_path = dir_path / "META.yml" | ||||||||||||
if meta_yml_path.exists(): | ||||||||||||
metadata = yaml.safe_load(meta_yml_path.read_text()) | metadata = yaml.safe_load(meta_yml_path.read_text()) | |||||||||||
return metadata | return metadata | |||||||||||
class CpanLoader(PackageLoader[CpanPackageInfo]): | class CpanLoader(PackageLoader[CpanPackageInfo]): | |||||||||||
visit_type = "cpan" | visit_type = "cpan" | |||||||||||
def __init__( | def __init__( | |||||||||||
self, | self, | |||||||||||
storage: StorageInterface, | storage: StorageInterface, | |||||||||||
url: str, | url: str, | |||||||||||
api_base_url: str, | ||||||||||||
franckbret (Unsubmitted, Not Done) — inline comment: api_base_url should be removed too?
vlorentz (Unsubmitted, Not Done) — inline comment: indeed
anlambert (Author, Unsubmitted, Done) — inline comment: No, it is used in D8652 to compute extrinsic metadata URLs (@vlorentz, you asked me to do that…
vlorentz (Unsubmitted, Not Done) — inline comment: my bad, I didn't connect the dots
artifacts: List[Dict[str, Any]], | ||||||||||||
module_metadata: List[Dict[str, Any]], | ||||||||||||
**kwargs, | **kwargs, | |||||||||||
): | ): | |||||||||||
super().__init__(storage=storage, url=url, **kwargs) | super().__init__(storage=storage, url=url, **kwargs) | |||||||||||
self.url = url | self.url = url | |||||||||||
self.api_base_url = api_base_url | ||||||||||||
@cached_method | self.artifacts: Dict[str, Dict] = { | |||||||||||
def info_versions(self) -> Dict: | artifact["version"]: {k: v for k, v in artifact.items() if k != "version"} | |||||||||||
"""Return the package versions (fetched from | for artifact in artifacts | |||||||||||
``https://fastapi.metacpan.org/v1/release/versions/{pkgname}``) | } | |||||||||||
self.module_metadata: Dict[str, Dict] = { | ||||||||||||
Api documentation https://cpan.haskell.org/api | meta["version"]: meta for meta in module_metadata | |||||||||||
""" | } | |||||||||||
pkgname = self.url.split("/")[-1] | ||||||||||||
url = f"https://fastapi.metacpan.org/v1/release/versions/{pkgname}" | ||||||||||||
data = json.loads(get_url_body(url=url, headers={"Accept": "application/json"})) | ||||||||||||
return {release["version"]: release for release in data["releases"]} | ||||||||||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | |||||||||||
"""Get all released versions of a Perl package | """Get all released versions of a Perl package | |||||||||||
Returns: | Returns: | |||||||||||
A sequence of versions | A sequence of versions | |||||||||||
Example:: | Example:: | |||||||||||
["0.1.1", "0.10.2"] | ["0.1.1", "0.10.2"] | |||||||||||
""" | """ | |||||||||||
versions = list(self.info_versions().keys()) | versions = list(self.artifacts.keys()) | |||||||||||
versions.sort(key=parse_version) | versions.sort(key=parse_version) | |||||||||||
return versions | return versions | |||||||||||
def get_default_version(self) -> str: | def get_default_version(self) -> str: | |||||||||||
"""Get the newest release version of a Perl package | """Get the newest release version of a Perl package | |||||||||||
Returns: | Returns: | |||||||||||
A string representing a version | A string representing a version | |||||||||||
Example:: | Example:: | |||||||||||
"0.10.2" | "0.10.2" | |||||||||||
""" | """ | |||||||||||
return self.get_versions()[-1] | return self.get_versions()[-1] | |||||||||||
def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]: | def get_package_info(self, version: str) -> Iterator[Tuple[str, CpanPackageInfo]]: | |||||||||||
"""Get release name and package information from version | """Get release name and package information from version | |||||||||||
Args: | Args: | |||||||||||
version: Package version (e.g: "0.1.0") | version: Package version (e.g: "0.1.0") | |||||||||||
Returns: | Returns: | |||||||||||
Iterator of tuple (release_name, p_info) | Iterator of tuple (release_name, p_info) | |||||||||||
""" | """ | |||||||||||
data = self.info_versions()[version] | artifact = self.artifacts[version] | |||||||||||
pkgname: str = self.url.split("/")[-1] | metadata = self.module_metadata[version] | |||||||||||
url: str = data["download_url"] | ||||||||||||
filename: str = url.split("/")[-1] | ||||||||||||
# The api does not provide an explicit timezone, defaults to UTC | ||||||||||||
last_modified = iso8601.parse_date(data["date"]) | ||||||||||||
if "author" in data: | last_modified = iso8601.parse_date(metadata["date"]) | |||||||||||
author = Person.from_fullname(data["author"].encode()) | author = ( | |||||||||||
else: | Person.from_fullname(metadata["author"].encode()) | |||||||||||
author = EMPTY_AUTHOR | if metadata["author"] | |||||||||||
else EMPTY_AUTHOR | ||||||||||||
) | ||||||||||||
p_info = CpanPackageInfo( | p_info = CpanPackageInfo( | |||||||||||
name=pkgname, | name=metadata["name"], | |||||||||||
filename=filename, | filename=artifact["filename"], | |||||||||||
url=url, | url=artifact["url"], | |||||||||||
version=version, | version=version, | |||||||||||
last_modified=last_modified, | last_modified=last_modified, | |||||||||||
author=author, | author=author, | |||||||||||
checksums=artifact["checksums"], | ||||||||||||
) | ) | |||||||||||
yield release_name(version), p_info | yield release_name(version), p_info | |||||||||||
def build_release( | def build_release( | |||||||||||
self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: CpanPackageInfo, uncompressed_path: str, directory: Sha1Git | |||||||||||
) -> Optional[Release]: | ) -> Optional[Release]: | |||||||||||
# Extract intrinsic metadata from uncompressed_path/META.json|.yml | # Extract intrinsic metadata from uncompressed_path/META.json|.yml | |||||||||||
intrinsic_metadata = extract_intrinsic_metadata( | intrinsic_metadata = extract_intrinsic_metadata( | |||||||||||
Path(uncompressed_path) / f"{p_info.name}-{p_info.version}" | Path(uncompressed_path) / f"{p_info.name}-{p_info.version}" | |||||||||||
) | ) | |||||||||||
name: str = intrinsic_metadata["name"] | ||||||||||||
assert name == p_info.name | ||||||||||||
version: str = str(intrinsic_metadata["version"]) | ||||||||||||
assert version == p_info.version | ||||||||||||
# author data from http endpoint are less complete than from META | # author data from http endpoint are less complete than from META | |||||||||||
if "author" in intrinsic_metadata: | if "author" in intrinsic_metadata: | |||||||||||
author_data = intrinsic_metadata["author"] | author_data = intrinsic_metadata["author"] | |||||||||||
if type(author_data) is list: | if type(author_data) is list: | |||||||||||
author = author_data[0] | author = author_data[0] | |||||||||||
else: | else: | |||||||||||
author = author_data | author = author_data | |||||||||||
author = Person.from_fullname(author.encode()) | author = Person.from_fullname(author.encode()) | |||||||||||
else: | else: | |||||||||||
author = p_info.author | author = p_info.author | |||||||||||
message = ( | message = ( | |||||||||||
f"Synthetic release for Perl source package {name} version {version}\n" | f"Synthetic release for Perl source package {p_info.name} " | |||||||||||
f"version {p_info.version}\n" | ||||||||||||
) | ) | |||||||||||
return Release( | return Release( | |||||||||||
name=version.encode(), | name=p_info.version.encode(), | |||||||||||
author=author, | author=author, | |||||||||||
date=TimestampWithTimezone.from_datetime(p_info.last_modified), | date=TimestampWithTimezone.from_datetime(p_info.last_modified), | |||||||||||
message=message.encode(), | message=message.encode(), | |||||||||||
target_type=ObjectType.DIRECTORY, | target_type=ObjectType.DIRECTORY, | |||||||||||
target=directory, | target=directory, | |||||||||||
synthetic=True, | synthetic=True, | |||||||||||
) | ) |