diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,7 +7,7 @@
 import logging
 import tempfile
 import os
-
+import sys
 from typing import (
     Any,
     Dict,
@@ -38,7 +38,12 @@
     Origin,
     OriginVisit,
     OriginVisitStatus,
+    MetadataAuthority,
+    MetadataFetcher,
+    MetadataTargetType,
+    RawExtrinsicMetadata,
 )
+from swh.model.identifiers import SWHID
 from swh.storage import get_storage
 from swh.storage.utils import now
 from swh.storage.algos.snapshot import snapshot_get_latest
@@ -66,6 +71,19 @@
     url = attr.ib(type=str)
     filename = attr.ib(type=Optional[str])
 
+    # The following attribute has kw_only=True in order to allow subclasses
+    # to add attributes. Without kw_only, attributes without default values cannot
+    # go after attributes with default values.
+    # See the attrs documentation of the `kw_only` argument.
+
+    revision_extrinsic_metadata = attr.ib(
+        type=List[Tuple[datetime.datetime, str, bytes]], factory=list, kw_only=True,
+    )
+    """Tuple elements are respectively the 'discovery_date', 'format',
+    and 'metadata' fields of RawExtrinsicMetadata"""
+
+    # TODO: add support for metadata for origins, directories, and contents
+
     @property
     def ID_KEYS(self):
         raise NotImplementedError(f"{self.__class__.__name__} is missing ID_KEYS")
@@ -81,6 +99,11 @@
     # Origin visit type (str) set by the loader
     visit_type = ""
 
+    DEFAULT_CONFIG = {
+        "create_authorities": ("bool", True),
+        "create_fetchers": ("bool", True),
+    }
+
     def __init__(self, url):
         """Loader's constructor. This raises exception if the minimal required
            configuration is missing (cf. fn:`check` method).
@@ -367,6 +390,8 @@
                 if revision_id is None:
                     try:
                         revision_id = self._load_revision(p_info, origin)
+                        if revision_id:
+                            self._load_extrinsic_revision_metadata(p_info, revision_id)
                         self.storage.flush()
                         status_load = "eventful"
                     except Exception as e:
@@ -517,3 +542,103 @@
         self.storage.snapshot_add([snapshot])
 
         return snapshot
+
+    def get_loader_name(self) -> str:
+        """Returns a fully qualified name of this loader."""
+        return f"{self.__class__.__module__}.{self.__class__.__name__}"
+
+    def get_loader_version(self) -> str:
+        """Returns the version of the current loader."""
+        module_name = self.__class__.__module__ or ""
+        module_name_parts = module_name.split(".")
+
+        # Iterate rootward through the package hierarchy until we find a parent of this
+        # loader's module with a __version__ attribute.
+        for prefix_size in range(len(module_name_parts), 0, -1):
+            package_name = ".".join(module_name_parts[0:prefix_size])
+            module = sys.modules[package_name]
+            if hasattr(module, "__version__"):
+                return module.__version__  # type: ignore
+
+        # If this loader's class has no parent package with a __version__,
+        # it should implement it itself.
+        raise NotImplementedError(
+            f"Could not dynamically find the version of {self.get_loader_name()}."
+        )
+
+    def get_metadata_fetcher(self) -> MetadataFetcher:
+        """Returns a MetadataFetcher instance representing this package loader;
+        which is used for adding provenance information to extracted
+        extrinsic metadata, if any."""
+        return MetadataFetcher(
+            name=self.get_loader_name(), version=self.get_loader_version(), metadata={},
+        )
+
+    def get_metadata_authority(self) -> MetadataAuthority:
+        """For package loaders that get extrinsic metadata, returns the authority
+        the metadata are coming from.
+        """
+        raise NotImplementedError("get_metadata_authority")
+
+    def build_extrinsic_revision_metadata(
+        self, p_info: TPackageInfo, revision_id: Sha1Git
+    ) -> List[RawExtrinsicMetadata]:
+        """Builds one RawExtrinsicMetadata object targeting the revision
+        ``revision_id`` for each item of ``p_info.revision_extrinsic_metadata``.
+        """
+        if not p_info.revision_extrinsic_metadata:
+            # If this package loader doesn't write metadata, no need to require
+            # an implementation for get_metadata_authority.
+            return []
+
+        authority = self.get_metadata_authority()
+        fetcher = self.get_metadata_fetcher()
+
+        metadata_objects = []
+
+        for (discovery_date, format, metadata) in p_info.revision_extrinsic_metadata:
+            metadata_objects.append(
+                RawExtrinsicMetadata(
+                    type=MetadataTargetType.REVISION,
+                    id=SWHID(object_type="revision", object_id=revision_id),
+                    discovery_date=discovery_date,
+                    authority=authority,
+                    fetcher=fetcher,
+                    format=format,
+                    metadata=metadata,
+                    origin=self.url,
+                )
+            )
+
+        return metadata_objects
+
+    def _load_extrinsic_revision_metadata(
+        self, p_info: TPackageInfo, revision_id: Sha1Git
+    ) -> None:
+        """Writes the extrinsic metadata of the revision to the storage, after
+        registering their authorities and fetchers (unless the ``create_authorities``
+        or ``create_fetchers`` configuration entry, respectively, is false)."""
+        metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id)
+
+        authorities = {
+            (
+                metadata_object.authority.type,
+                metadata_object.authority.url,
+            ): metadata_object.authority
+            for metadata_object in metadata_objects
+        }
+        if authorities and self.config.get("create_authorities", True):
+            self.storage.metadata_authority_add(authorities.values())
+
+        fetchers = {
+            (
+                metadata_object.fetcher.name,
+                metadata_object.fetcher.version,
+            ): metadata_object.fetcher
+            for metadata_object in metadata_objects
+        }
+        if fetchers and self.config.get("create_fetchers", True):
+            self.storage.metadata_fetcher_add(fetchers.values())
+
+        if metadata_objects:
+            self.storage.object_metadata_add(metadata_objects)
diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tests/test_loader_metadata.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019-2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+from typing import Iterator, Optional, Sequence, Tuple
+
+import attr
+
+from swh.loader.package.loader import BasePackageInfo, PackageLoader
+from swh.model.hashutil import hash_to_bytes
+from swh.model.identifiers import SWHID
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    MetadataTargetType,
+    RawExtrinsicMetadata,
+    Sha1Git,
+)
+from swh.storage import get_storage
+
+from swh.loader.package import __version__
+
+EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
+FULL_SNAPSHOT_ID = "4a9b608c9f01860a627237dd2409d1d50ec4b054"
+
+AUTHORITY = MetadataAuthority(
+    type=MetadataAuthorityType.FORGE, url="http://example.org/",
+)
+ORIGIN_URL = "http://example.org/archive.tgz"
+
+REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
+REVISION_SWHID = SWHID(object_type="revision", object_id=REVISION_ID)
+
+
+FETCHER = MetadataFetcher(
+    name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
+    version=__version__,
+)
+
+METADATA = [
+    RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=REVISION_SWHID,
+        discovery_date=datetime.datetime.now(),
+        authority=AUTHORITY,
+        fetcher=FETCHER,
+        format="test-format1",
+        metadata=b"foo bar",
+        origin=ORIGIN_URL,
+    ),
+    RawExtrinsicMetadata(
+        type=MetadataTargetType.REVISION,
+        id=REVISION_SWHID,
+        discovery_date=datetime.datetime.now() + datetime.timedelta(seconds=1),
+        authority=AUTHORITY,
+        fetcher=FETCHER,
+        format="test-format2",
+        metadata=b"bar baz",
+        origin=ORIGIN_URL,
+    ),
+]
+
+
+class MetadataTestLoader(PackageLoader[BasePackageInfo]):
+    def get_versions(self) -> Sequence[str]:
+        return ["v1.0.0"]
+
+    def _load_revision(self, p_info: BasePackageInfo, origin) -> Optional[Sha1Git]:
+        return REVISION_ID
+
+    def get_metadata_authority(self):
+        return attr.evolve(AUTHORITY, metadata={})
+
+    def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]:
+        m0 = METADATA[0]
+        m1 = METADATA[1]
+        p_info = BasePackageInfo(
+            url=ORIGIN_URL,
+            filename="archive.tgz",
+            revision_extrinsic_metadata=[
+                (m0.discovery_date, m0.format, m0.metadata),
+                (m1.discovery_date, m1.format, m1.metadata),
+            ],
+        )
+
+        yield (version, p_info)
+
+
+def test_load_revision_metadata(swh_config, caplog):
+    storage = get_storage("memory")
+
+    loader = MetadataTestLoader(ORIGIN_URL)
+    loader.storage = storage
+
+    load_status = loader.load()
+    assert load_status == {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+
+    result = storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert result["next_page_token"] is None
+    assert result["results"] == METADATA
+
+    assert caplog.text == ""
+
+
+def test_existing_authority(swh_config, caplog):
+    storage = get_storage("memory")
+
+    loader = MetadataTestLoader(ORIGIN_URL)
+    loader.storage = storage
+    loader.config["create_authorities"] = False
+
+    storage.metadata_authority_add([attr.evolve(AUTHORITY, metadata={})])
+
+    load_status = loader.load()
+    assert load_status == {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+
+    result = storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert result["next_page_token"] is None
+    assert result["results"] == METADATA
+
+    assert caplog.text == ""
+
+
+def test_existing_fetcher(swh_config, caplog):
+    storage = get_storage("memory")
+
+    loader = MetadataTestLoader(ORIGIN_URL)
+    loader.storage = storage
+    loader.config["create_fetchers"] = False
+
+    storage.metadata_fetcher_add([attr.evolve(FETCHER, metadata={})])
+
+    load_status = loader.load()
+    assert load_status == {
+        "status": "eventful",
+        "snapshot_id": FULL_SNAPSHOT_ID,
+    }
+
+    result = storage.object_metadata_get(
+        MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY,
+    )
+    assert result["next_page_token"] is None
+    assert result["results"] == METADATA
+
+    assert caplog.text == ""