Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/rubygems/loader.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
import os | import os | ||||
import string | |||||
from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple | from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple | ||||
import attr | import attr | ||||
from packaging.version import parse as parse_version | |||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import ( | ||||
from swh.loader.package.utils import cached_method, get_url_body, release_name | BasePackageInfo, | ||||
PackageLoader, | |||||
RawExtrinsicMetadataCore, | |||||
) | |||||
from swh.loader.package.utils import get_url_body, release_name | |||||
from swh.model import from_disk | from swh.model import from_disk | ||||
from swh.model.model import ObjectType, Person, Release, Sha1Git, TimestampWithTimezone | from swh.model.model import ( | ||||
MetadataAuthority, | |||||
MetadataAuthorityType, | |||||
ObjectType, | |||||
Person, | |||||
Release, | |||||
Sha1Git, | |||||
TimestampWithTimezone, | |||||
) | |||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
@attr.s | @attr.s | ||||
class RubyGemsPackageInfo(BasePackageInfo): | class RubyGemsPackageInfo(BasePackageInfo): | ||||
name = attr.ib(type=str) | name = attr.ib(type=str) | ||||
"""Name of the package""" | """Name of the package""" | ||||
version = attr.ib(type=str) | version = attr.ib(type=str) | ||||
"""Current version""" | """Current version""" | ||||
built_at = attr.ib(type=Optional[TimestampWithTimezone]) | built_at = attr.ib(type=Optional[TimestampWithTimezone]) | ||||
"""Version build date""" | """Version build date""" | ||||
authors = attr.ib(type=List[Person]) | authors = attr.ib(type=List[Person]) | ||||
"""Authors""" | """Authors""" | ||||
sha256 = attr.ib(type=str) | |||||
"""Extid as sha256""" | |||||
MANIFEST_FORMAT = string.Template( | |||||
"name $name\nshasum $sha256\nurl $url\nversion $version\nlast_update $built_at" | |||||
) | |||||
EXTID_TYPE = "rubygems-manifest-sha256" | |||||
EXTID_VERSION = 0 | |||||
class RubyGemsLoader(PackageLoader[RubyGemsPackageInfo]): | class RubyGemsLoader(PackageLoader[RubyGemsPackageInfo]): | ||||
"""Load ``.gem`` files from ``RubyGems.org`` into the SWH archive.""" | """Load ``.gem`` files from ``RubyGems.org`` into the SWH archive.""" | ||||
visit_type = "rubygems" | visit_type = "rubygems" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
artifacts: List[Dict[str, Any]], | |||||
rubygems_metadata: List[Dict[str, Any]], | |||||
max_content_size: Optional[int] = None, | max_content_size: Optional[int] = None, | ||||
**kwargs, | **kwargs, | ||||
): | ): | ||||
super().__init__(storage, url, max_content_size=max_content_size, **kwargs) | super().__init__(storage, url, max_content_size=max_content_size, **kwargs) | ||||
# Lister URLs are in the ``https://rubygems.org/gems/{pkgname}`` format | # Lister URLs are in the ``https://rubygems.org/gems/{pkgname}`` format | ||||
assert url.startswith("https://rubygems.org/gems/"), ( | assert url.startswith("https://rubygems.org/gems/"), ( | ||||
"Expected rubygems.org url, got '%s'" % url | "Expected rubygems.org url, got '%s'" % url | ||||
) | ) | ||||
self.gem_name = url[len("https://rubygems.org/gems/") :] | # Convert list of artifacts and rubygems_metadata to a mapping of version | ||||
# API docs at ``https://guides.rubygems.org/rubygems-org-api/`` | self.artifacts: Dict[str, Dict] = { | ||||
self.api_base_url = "https://rubygems.org/api/v1" | artifact["version"]: artifact for artifact in artifacts | ||||
# Mapping of version number to corresponding metadata from the API | } | ||||
self.versions_info: Dict[str, Dict[str, Any]] = {} | self.rubygems_metadata: Dict[str, Dict] = { | ||||
data["version"]: data for data in rubygems_metadata | |||||
} | |||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
"""Return all versions for the gem being loaded. | """Return all versions sorted for the gem being loaded""" | ||||
versions = list(self.artifacts.keys()) | |||||
Also stores the detailed information for each version since everything | versions.sort(key=parse_version) | ||||
is present in this API call.""" | |||||
versions_info = get_url_body( | |||||
f"{self.api_base_url}/versions/{self.gem_name}.json" | |||||
) | |||||
versions = [] | |||||
for version_info in json.loads(versions_info): | |||||
number = version_info["number"] | |||||
self.versions_info[number] = version_info | |||||
versions.append(number) | |||||
return versions | return versions | ||||
@cached_method | |||||
def get_default_version(self) -> str: | def get_default_version(self) -> str: | ||||
latest = get_url_body( | """Get the newest release version of a gem""" | ||||
f"{self.api_base_url}/versions/{self.gem_name}/latest.json" | return self.get_versions()[-1] | ||||
def get_metadata_authority(self): | |||||
return MetadataAuthority( | |||||
type=MetadataAuthorityType.FORGE, | |||||
url="https://rubygems.org/", | |||||
) | ) | ||||
return json.loads(latest)["version"] | |||||
def _load_directory( | def _load_directory( | ||||
self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str | self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], tmpdir: str | ||||
) -> Tuple[str, from_disk.Directory]: | ) -> Tuple[str, from_disk.Directory]: | ||||
"""Override the directory loading to point it to the actual code. | """Override the directory loading to point it to the actual code. | ||||
Gem files are uncompressed tarballs containing: | Gem files are uncompressed tarballs containing: | ||||
- ``metadata.gz``: the metadata about this gem | - ``metadata.gz``: the metadata about this gem | ||||
- ``data.tar.gz``: the code and possible binary artifacts | - ``data.tar.gz``: the code and possible binary artifacts | ||||
- ``checksums.yaml.gz``: checksums | - ``checksums.yaml.gz``: checksums | ||||
""" | """ | ||||
anlambert: I think we should use the value from the `built_at` field instead for the release date.
For… | |||||
logger.debug("Unpacking gem file to point to the actual code") | logger.debug("Unpacking gem file to point to the actual code") | ||||
uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) | uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) | ||||
source_code_tarball = os.path.join(uncompressed_path, "data.tar.gz") | source_code_tarball = os.path.join(uncompressed_path, "data.tar.gz") | ||||
return super()._load_directory([(source_code_tarball, {})], tmpdir) | return super()._load_directory([(source_code_tarball, {})], tmpdir) | ||||
def get_package_info( | def get_package_info( | ||||
self, version: str | self, version: str | ||||
) -> Iterator[Tuple[str, RubyGemsPackageInfo]]: | ) -> Iterator[Tuple[str, RubyGemsPackageInfo]]: | ||||
info = self.versions_info[version] | artifact = self.artifacts[version] | ||||
rubygem_metadata = self.rubygems_metadata[version] | |||||
filename = artifact["filename"] | |||||
gem_name = filename.split(f"-{version}.gem")[0] | |||||
authors = rubygem_metadata["authors"].split(", ") | |||||
checksums = artifact["checksums"] | |||||
# Get extrinsic metadata | |||||
extrinsic_metadata_url = rubygem_metadata["extrinsic_metadata_url"] | |||||
extrinsic_metadata_raw = get_url_body(extrinsic_metadata_url) | |||||
extrinsic_metadata = json.loads(extrinsic_metadata_raw) | |||||
anlambert (inline comment, unsubmitted, not done): There is no need to parse the received JSON here.
authors = info["authors"].split(", ") | |||||
p_info = RubyGemsPackageInfo( | p_info = RubyGemsPackageInfo( | ||||
url=f"https://rubygems.org/downloads/{self.gem_name}-{version}.gem", | url=artifact["url"], | ||||
# See format of gem files in ``_load_directory`` | filename=filename, | ||||
filename=f"{self.gem_name}-{version}.tar", | |||||
version=version, | version=version, | ||||
built_at=TimestampWithTimezone.from_iso8601(info["built_at"]), | built_at=TimestampWithTimezone.from_iso8601(rubygem_metadata["date"]), | ||||
name=self.gem_name, | name=gem_name, | ||||
authors=[Person.from_fullname(person.encode()) for person in authors], | authors=[Person.from_fullname(person.encode()) for person in authors], | ||||
checksums=checksums, # sha256 checksum | |||||
sha256=checksums["sha256"], # sha256 for EXTID | |||||
directory_extrinsic_metadata=[ | |||||
RawExtrinsicMetadataCore( | |||||
format="rubygem-package-json", | |||||
anlambert (inline comment, unsubmitted, not done): Use `rubygem-release-json` as the format name instead (the cpan loader uses `cpan-release-json`).
metadata=json.dumps([extrinsic_metadata]).encode(), | |||||
anlambert (inline comment, unsubmitted, not done): Use `metadata=extrinsic_metadata_raw`.
), | |||||
], | |||||
) | ) | ||||
yield release_name(version), p_info | yield release_name(version), p_info | ||||
def build_release( | def build_release( | ||||
self, p_info: RubyGemsPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: RubyGemsPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Release]: | ) -> Optional[Release]: | ||||
msg = ( | msg = ( | ||||
f"Synthetic release for RubyGems source package {p_info.name} " | f"Synthetic release for RubyGems source package {p_info.name} " | ||||
Show All 13 Lines |
I think we should use the value from the `built_at` field for the release date instead.
For instance, if you look at all the rails versions on its HTML page,
you can see that the `built_at` date is used as the date of each release (see the data from the REST API).