Store "original artifact" metadata as RawExtrinsicMetadata on revisions

The package loader additionally records the per-artifact metadata taken from
dl_artifacts as a RawExtrinsicMetadata object (format "original-artifact-json",
a JSON-encoded list) targeting the revision, under the new module-level
SWH_METADATA_AUTHORITY.  The same list is still written to the revision's
"original_artifact" metadata entry for now (see the TODO in the hunk).

The test loader stubs _load_directory, download_package and build_revision
rather than _load_revision, and test_load_artifact_metadata checks the
metadata object that ends up in storage.

diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import datetime
+import json
 import logging
 import tempfile
 import os
@@ -40,6 +41,7 @@
     OriginVisit,
     OriginVisitStatus,
     MetadataAuthority,
+    MetadataAuthorityType,
     MetadataFetcher,
     MetadataTargetType,
     RawExtrinsicMetadata,
@@ -56,6 +58,16 @@
 logger = logging.getLogger(__name__)
 
 
+SWH_METADATA_AUTHORITY = MetadataAuthority(
+    type=MetadataAuthorityType.REGISTRY,
+    url="https://softwareheritage.org/",
+    metadata={},
+)
+"""Metadata authority for extrinsic metadata generated by Software Heritage.
+Used for metadata on "original artifacts", i.e. length, filename, and checksums
+of downloaded archive files."""
+
+
 @attr.s
 class RawExtrinsicMetadataCore:
     """Contains the core of the metadata extracted by a loader, that will be
@@ -175,7 +187,7 @@
             uncompressed_path: Artifact uncompressed path on disk
 
         Returns:
-            SWH data dict
+            Revision object
 
         """
         raise NotImplementedError("build_revision")
@@ -529,17 +541,32 @@
             # skipping those
             return None
 
+        metadata = [hashes for _, hashes in dl_artifacts]
         extra_metadata: Tuple[str, Any] = (
             "original_artifact",
-            [hashes for _, hashes in dl_artifacts],
+            metadata,
         )
+
         if revision.metadata is not None:
             full_metadata = list(revision.metadata.items()) + [extra_metadata]
         else:
             full_metadata = [extra_metadata]
 
+        # TODO: don't add these extrinsic metadata to the revision.
         revision = attr.evolve(revision, metadata=ImmutableDict(full_metadata))
 
+        original_artifact_metadata = RawExtrinsicMetadata(
+            type=MetadataTargetType.REVISION,
+            id=SWHID(object_type="revision", object_id=revision.id),
+            discovery_date=self.visit_date,
+            authority=SWH_METADATA_AUTHORITY,
+            fetcher=self.get_metadata_fetcher(),
+            format="original-artifact-json",
+            metadata=json.dumps(metadata).encode(),
+            origin=self.url,
+        )
+        self._load_metadata_objects([original_artifact_metadata])
+
         logger.debug("Revision: %s", revision)
 
         self.storage.revision_add([revision])
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
--- a/swh/loader/package/tests/test_loader_metadata.py
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -4,7 +4,7 @@
 # See top-level LICENSE file for more information
 
 import datetime
-from typing import Iterator, List, Optional, Sequence, Tuple
+from typing import Iterator, List, Sequence, Tuple
 
 import attr
 
@@ -20,7 +20,10 @@
     MetadataAuthorityType,
     MetadataFetcher,
     MetadataTargetType,
+    Person,
     RawExtrinsicMetadata,
+    Revision,
+    RevisionType,
     Sha1Git,
 )
 from swh.storage import get_storage
@@ -86,8 +89,29 @@
     def get_versions(self) -> Sequence[str]:
         return ["v1.0.0"]
 
-    def _load_revision(self, p_info: BasePackageInfo, origin) -> Optional[Sha1Git]:
-        return REVISION_ID
+    def _load_directory(self, dl_artifacts, tmpdir):
+        class directory:
+            hash = None
+
+        return (None, directory)  # just enough for _load_revision to work
+
+    def download_package(self, p_info: BasePackageInfo, tmpdir: str):
+        return [("path", {"artifact_key": "value", "length": 0})]
+
+    def build_revision(
+        self, p_info: BasePackageInfo, uncompressed_path: str, directory: Sha1Git
+    ):
+        return Revision(
+            id=REVISION_ID,
+            message=b"",
+            author=Person.from_fullname(b""),
+            committer=Person.from_fullname(b""),
+            date=None,
+            committer_date=None,
+            type=RevisionType.TAR,
+            directory=b"foo",
+            synthetic=False,
+        )
 
     def get_metadata_authority(self):
         return attr.evolve(AUTHORITY, metadata={})
@@ -111,6 +135,39 @@
         return [RawExtrinsicMetadataCore(m.format, m.metadata, m.discovery_date)]
 
 
+def test_load_artifact_metadata(swh_config, caplog):
+    storage = get_storage("memory")
+
+    loader = MetadataTestLoader(ORIGIN_URL)
+    loader.storage = storage
+
+    load_status = loader.load()
+    assert load_status == {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+
+    authority = MetadataAuthority(
+        type=MetadataAuthorityType.REGISTRY, url="https://softwareheritage.org/",
+    )
+
+    result = storage.raw_extrinsic_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, authority,
+    )
+    assert result.next_page_token is None
+    assert len(result.results) == 1
+    assert result.results[0] == RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=REVISION_SWHID,
+        discovery_date=result.results[0].discovery_date,
+        authority=authority,
+        fetcher=FETCHER,
+        format="original-artifact-json",
+        metadata=b'[{"artifact_key": "value", "length": 0}]',
+        origin=ORIGIN_URL,
+    )
+
+
 def test_load_metadata(swh_config, caplog):
     storage = get_storage("memory")
 