diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -7,9 +7,12 @@ import hashlib import logging import os -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, List, Optional + +import sentry_sdk from swh.core.config import load_from_envvar +from swh.loader.core.metadata_fetchers import CredentialsType, get_fetchers_for_lister from swh.loader.exception import NotFound from swh.model.model import ( BaseContent, @@ -18,6 +21,7 @@ Origin, OriginVisit, OriginVisitStatus, + RawExtrinsicMetadata, Release, Revision, Sha1Git, @@ -53,6 +57,13 @@ - :class:`PyPILoader` - :class:`NpmLoader` + Args: + lister_name: Name of the lister which triggered this load. + If provided, the loader will try to use the forge's API to retrieve extrinsic + metadata + lister_instance_name: Name of the lister instance which triggered this load. + Must be None iff lister_name is, but it may be the empty string for listers + with a single instance. """ visit_type: str @@ -66,11 +77,27 @@ logging_class: Optional[str] = None, save_data_path: Optional[str] = None, max_content_size: Optional[int] = None, + lister_name: Optional[str] = None, + lister_instance_name: Optional[str] = None, + metadata_fetcher_credentials: CredentialsType = None, ): - super().__init__() + if lister_name == "": + raise ValueError("lister_name must not be the empty string") + if lister_name is None and lister_instance_name is not None: + raise ValueError( + f"lister_name is None but lister_instance_name is {lister_instance_name!r}" + ) + if lister_name is not None and lister_instance_name is None: + raise ValueError( + f"lister_instance_name is None but lister_name is {lister_name!r}" + ) + self.storage = storage self.origin = Origin(url=origin_url) self.max_content_size = int(max_content_size) if max_content_size else None + self.lister_name = lister_name + self.lister_instance_name = lister_instance_name + self.metadata_fetcher_credentials = metadata_fetcher_credentials or {} if logging_class is None: logging_class = "%s.%s" % ( @@ -297,6 +324,24 @@ "Load origin '%s' with type '%s'", self.origin.url, self.visit.type ) + try: + metadata = self.build_extrinsic_origin_metadata() + self.load_metadata_objects(metadata) + except Exception as e: + sentry_sdk.capture_exception(e) + # Do not fail the whole task if this is the only failure + self.log.exception( + "Failure while loading extrinsic origin metadata.", + extra={ + "swh_task_args": [], + "swh_task_kwargs": { + "origin": self.origin.url, + "lister_name": self.lister_name, + "lister_instance_name": self.lister_instance_name, + }, + }, + ) + try: self.prepare() @@ -329,7 +374,11 @@ status, extra={ "swh_task_args": [], - "swh_task_kwargs": {"origin": self.origin.url}, + "swh_task_kwargs": { + "origin": self.origin.url, + "lister_name": self.lister_name, + "lister_instance_name": self.lister_instance_name, + }, }, ) visit_status = OriginVisitStatus( @@ -349,6 +398,43 @@ return self.load_status() + def load_metadata_objects( + self, metadata_objects: List[RawExtrinsicMetadata] + ) -> None: + if not metadata_objects: + return + + authorities = {mo.authority for mo in metadata_objects} + self.storage.metadata_authority_add(list(authorities)) + + fetchers = {mo.fetcher for mo in metadata_objects} + self.storage.metadata_fetcher_add(list(fetchers)) + + self.storage.raw_extrinsic_metadata_add(metadata_objects) + + def build_extrinsic_origin_metadata(self) -> List[RawExtrinsicMetadata]: + """Builds a list of full RawExtrinsicMetadata objects, using + a metadata fetcher returned by :func:`get_fetcher_classes`.""" + if self.lister_name is None: + self.log.debug("lister_not provided, skipping extrinsic origin metadata") + return [] + + assert ( + self.lister_instance_name is not None + ), "lister_instance_name is None, but lister_name is not" + + metadata = [] + for cls in get_fetchers_for_lister(self.lister_name): + metadata_fetcher = cls( + origin=self.origin, + lister_name=self.lister_name, + lister_instance_name=self.lister_instance_name, + credentials=self.metadata_fetcher_credentials, + ) + metadata.extend(metadata_fetcher.get_origin_metadata()) + + return metadata + class DVCSLoader(BaseLoader): """This base class is a pattern for dvcs loaders (e.g. git, mercurial). diff --git a/swh/loader/core/metadata_fetchers.py b/swh/loader/core/metadata_fetchers.py new file mode 100644 --- /dev/null +++ b/swh/loader/core/metadata_fetchers.py @@ -0,0 +1,47 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import functools +from typing import Dict, List, Optional, Set, Type + +import pkg_resources +from typing_extensions import Protocol, runtime_checkable + +from swh.model.model import Origin, RawExtrinsicMetadata + +CredentialsType = Optional[Dict[str, Dict[str, List[Dict[str, str]]]]] + + +@runtime_checkable +class MetadataFetcherProtocol(Protocol): + """Interface provided by :class:`swh.loader.metadata.base.BaseMetadataFetcher` + to loaders, via setuptools entrypoints.""" + + SUPPORTED_LISTERS: Set[str] + + def __init__( + self, + origin: Origin, + credentials: CredentialsType, + lister_name: str, + lister_instance_name: str, + ): + ... + + def get_origin_metadata(self) -> List[RawExtrinsicMetadata]: + ... + + +@functools.lru_cache() +def _fetchers() -> List[Type[MetadataFetcherProtocol]]: + classes = [] + for entry_point in pkg_resources.iter_entry_points("swh.loader.metadata"): + classes.append(entry_point.load()) + + return classes + + +def get_fetchers_for_lister(lister_name: str) -> List[Type[MetadataFetcherProtocol]]: + return [cls for cls in _fetchers() if lister_name in cls.SUPPORTED_LISTERS] diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -6,6 +6,9 @@ import datetime import hashlib import logging +from unittest.mock import MagicMock + +import pytest from swh.loader.core.loader import BaseLoader, DVCSLoader from swh.loader.exception import NotFound @@ -19,6 +22,7 @@ RawExtrinsicMetadata, Snapshot, ) +import swh.storage.exc ORIGIN = Origin(url="some-url") @@ -89,6 +93,16 @@ pass +class DummyMetadataFetcher: + SUPPORTED_LISTERS = {"fake-lister"} + + def __init__(self, origin, credentials, lister_name, lister_instance_name): + pass + + def get_origin_metadata(self): + return [REMD] + + def test_base_loader(swh_storage): loader = DummyBaseLoader(swh_storage) result = loader.load() @@ -101,6 +115,49 @@ assert result == {"status": "eventful"} +def test_base_loader_with_known_lister_name(swh_storage, mocker): + fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) + fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS + mocker.patch( + "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] + ) + + loader = DummyBaseLoader( + swh_storage, lister_name="fake-lister", lister_instance_name="" + ) + result = loader.load() + assert result == {"status": "eventful"} + + fetcher_cls.assert_called_once() + fetcher_cls.assert_called_once_with( + origin=ORIGIN, + credentials={}, + lister_name="fake-lister", + lister_instance_name="", + ) + assert swh_storage.raw_extrinsic_metadata_get( + ORIGIN.swhid(), METADATA_AUTHORITY + ).results == [REMD] + + +def test_base_loader_with_unknown_lister_name(swh_storage, mocker): + fetcher_cls = MagicMock(wraps=DummyMetadataFetcher) + fetcher_cls.SUPPORTED_LISTERS = DummyMetadataFetcher.SUPPORTED_LISTERS + mocker.patch( + "swh.loader.core.metadata_fetchers._fetchers", return_value=[fetcher_cls] + ) + + loader = DummyBaseLoader( + swh_storage, lister_name="other-lister", lister_instance_name="" + ) + result = loader.load() + assert result == {"status": "eventful"} + + fetcher_cls.assert_not_called() + with pytest.raises(swh.storage.exc.StorageArgumentException): + swh_storage.raw_extrinsic_metadata_get(ORIGIN.swhid(), METADATA_AUTHORITY) + + def test_dvcs_loader(swh_storage): loader = DummyDVCSLoader(swh_storage) result = loader.load() diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -16,7 +16,6 @@ Any, Dict, Generic, - Iterable, Iterator, List, Mapping, @@ -737,7 +736,7 @@ if snapshot: try: metadata_objects = self.build_extrinsic_snapshot_metadata(snapshot.id) - self._load_metadata_objects(metadata_objects) + self.load_metadata_objects(metadata_objects) except Exception as e: error = ( f"Failed to load extrinsic snapshot metadata for {self.origin.url}" @@ -750,7 +749,7 @@ try: metadata_objects = self.build_extrinsic_origin_metadata() - self._load_metadata_objects(metadata_objects) + self.load_metadata_objects(metadata_objects) except Exception as e: error = f"Failed to load extrinsic origin metadata for {self.origin.url}" logger.exception(error) @@ -839,7 +838,7 @@ origin=self.origin.url, release=release.swhid(), ) - self._load_metadata_objects([original_artifact_metadata]) + self.load_metadata_objects([original_artifact_metadata]) logger.debug("Release: %s", release) @@ -1046,34 +1045,7 @@ metadata_objects = self.build_extrinsic_directory_metadata( p_info, release_id, directory_id ) - self._load_metadata_objects(metadata_objects) - - def _load_metadata_objects( - self, metadata_objects: List[RawExtrinsicMetadata] - ) -> None: - if not metadata_objects: - # If this package loader doesn't write metadata, no need to require - # an implementation for get_metadata_authority. - return - - self._create_authorities(mo.authority for mo in metadata_objects) - self._create_fetchers(mo.fetcher for mo in metadata_objects) - - self.storage.raw_extrinsic_metadata_add(metadata_objects) - - def _create_authorities(self, authorities: Iterable[MetadataAuthority]) -> None: - deduplicated_authorities = { - (authority.type, authority.url): authority for authority in authorities - } - if authorities: - self.storage.metadata_authority_add(list(deduplicated_authorities.values())) - - def _create_fetchers(self, fetchers: Iterable[MetadataFetcher]) -> None: - deduplicated_fetchers = { - (fetcher.name, fetcher.version): fetcher for fetcher in fetchers - } - if fetchers: - self.storage.metadata_fetcher_add(list(deduplicated_fetchers.values())) + self.load_metadata_objects(metadata_objects) def _load_extids(self, extids: Set[ExtID]) -> None: if not extids: