diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -44,6 +44,7 @@
     directory: List[Dict]
     origin: List[Dict]
     origin_visit_status: List[Dict]
+    raw_extrinsic_metadata: List[Dict]
 
 
 @contextmanager
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -16,13 +16,20 @@
     TypeVar,
     cast,
 )
+from urllib.parse import urlparse
 
 import sentry_sdk
 
 from swh.core.config import merge_configs
 from swh.core.utils import grouper
 from swh.indexer.codemeta import merge_documents
-from swh.indexer.indexer import ContentIndexer, DirectoryIndexer, OriginIndexer
+from swh.indexer.indexer import (
+    BaseIndexer,
+    ContentIndexer,
+    DirectoryIndexer,
+    ObjectsDict,
+    OriginIndexer,
+)
 from swh.indexer.metadata_detector import detect_metadata
 from swh.indexer.metadata_dictionary import MAPPINGS
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -31,13 +38,14 @@
 from swh.indexer.storage.model import (
     ContentMetadataRow,
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.model import hashutil
-from swh.model.model import Directory
+from swh.model.model import Directory, MetadataAuthorityType
 from swh.model.model import ObjectType as ModelObjectType
-from swh.model.model import Origin, Sha1Git
-from swh.model.swhids import CoreSWHID, ObjectType
+from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git
+from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType
 
 REVISION_GET_BATCH_SIZE = 10
 RELEASE_GET_BATCH_SIZE = 10
@@ -59,6 +67,99 @@
         yield from f(list(group))
 
 
+class ExtrinsicMetadataIndexer(
+    BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow]
+):
+    def process_journal_objects(self, objects: ObjectsDict) -> Dict:
+        summary: Dict[str, Any] = {"status": "uneventful"}
+        try:
+            results = []
+            for item in objects.get("raw_extrinsic_metadata", []):
+                results.extend(
+                    self.index(item["id"], data=RawExtrinsicMetadata.from_dict(item))
+                )
+        except Exception:
+            if not self.catch_exceptions:
+                raise
+            summary["status"] = "failed"
+            return summary
+
+        summary_persist = self.persist_index_computations(results)
+        self.results = results
+        if summary_persist:
+            for value in summary_persist.values():
+                if value > 0:
+                    summary["status"] = "eventful"
+            summary.update(summary_persist)
+        return summary
+
+    def index(
+        self,
+        id: Sha1Git,
+        data: Optional[RawExtrinsicMetadata],
+        **kwargs,
+    ) -> List[OriginExtrinsicMetadataRow]:
+        if data is None:
+            raise NotImplementedError(
+                "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data"
+            )
+        if data.target.object_type != ExtendedObjectType.ORIGIN:
+            # other types are not supported yet
+            return []
+
+        if data.authority.type != MetadataAuthorityType.FORGE:
+            # metadata provided by a third-party; don't trust it
+            # (technically this could be handled below, but we check it here
+            # to return early; sparing a translation and origin lookup)
+            # TODO: add ways to define trusted authorities
+            return []
+
+        metadata_items = []
+        mappings = []
+        for (mapping_name, mapping) in MAPPINGS.items():
+            if data.format in mapping.extrinsic_metadata_formats():
+                metadata_item = mapping().translate(data.metadata)
+                if metadata_item is not None:
+                    metadata_items.append(metadata_item)
+                    mappings.append(mapping_name)
+
+        if not metadata_items:
+            # Don't have any mapping to parse it, ignore
+            return []
+
+        # TODO: batch requests to origin_get_by_sha1()
+        origins = self.storage.origin_get_by_sha1([data.target.object_id])
+        try:
+            (origin,) = origins
+            if origin is None:
+                raise ValueError()
+        except ValueError:
+            raise ValueError(f"Unknown origin {data.target}") from None
+
+        if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc:
+            # metadata provided by a third-party; don't trust it
+            # TODO: add ways to define trusted authorities
+            return []
+
+        metadata = merge_documents(metadata_items)
+
+        return [
+            OriginExtrinsicMetadataRow(
+                id=origin["url"],
+                indexer_configuration_id=self.tool["id"],
+                from_remd_id=data.id,
+                mappings=mappings,
+                metadata=metadata,
+            )
+        ]
+
+    def persist_index_computations(
+        self, results: List[OriginExtrinsicMetadataRow]
+    ) -> Dict[str, int]:
+        """Persist the results in storage."""
+        return self.idx_storage.origin_extrinsic_metadata_add(results)
+
+
 class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]):
     """Content-level indexer
 
@@ -129,15 +230,7 @@
     def persist_index_computations(
         self, results: List[ContentMetadataRow]
     ) -> Dict[str, int]:
-        """Persist the results in storage.
-
-        Args:
-            results: list of content_metadata, dict with the
-              following keys:
-              - id (bytes): content's identifier (sha1)
-              - metadata (jsonb): detected metadata
-
-        """
+        """Persist the results in storage."""
         return self.idx_storage.content_metadata_add(results)
@@ -239,16 +332,7 @@
     def persist_index_computations(
         self, results: List[DirectoryIntrinsicMetadataRow]
     ) -> Dict[str, int]:
-        """Persist the results in storage.
-
-        Args:
-            results: list of content_mimetype, dict with the
-              following keys:
-              - id (bytes): content's identifier (sha1)
-              - mimetype (bytes): mimetype in bytes
-              - encoding (bytes): encoding in bytes
-
-        """
+        """Persist the results in storage."""
         # TODO: add functions in storage to keep data in
         # directory_intrinsic_metadata
         return self.idx_storage.directory_intrinsic_metadata_add(results)
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -77,6 +77,7 @@
         raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
 
     def translate(self, file_content: bytes) -> Optional[Dict]:
+        """Translates intrinsic metadata from the content of a file."""
         raise NotImplementedError(f"{self.__class__.__name__}.translate")
 
     def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/swh/indexer/metadata_dictionary/ruby.py b/swh/indexer/metadata_dictionary/ruby.py
--- a/swh/indexer/metadata_dictionary/ruby.py
+++ b/swh/indexer/metadata_dictionary/ruby.py
@@ -6,7 +6,7 @@
 import ast
 import itertools
 import re
-from typing import List
+from typing import List, Tuple
 
 from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
 from swh.indexer.metadata_dictionary.base import DirectoryLsEntry
@@ -37,6 +37,11 @@
                 return [entry["sha1"]]
         return []
 
+    @classmethod
+    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+        # this class is only used by intrinsic metadata mappings
+        return ()
+
     def translate(self, raw_content):
         try:
             raw_content = raw_content.decode()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -3,10 +3,31 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.indexer.metadata import ContentMetadataIndexer, DirectoryMetadataIndexer
-from swh.indexer.storage.model import ContentMetadataRow, DirectoryIntrinsicMetadataRow
+import datetime
+from unittest.mock import call
+
+import attr
+
+from swh.indexer.metadata import (
+    ContentMetadataIndexer,
+    DirectoryMetadataIndexer,
+    ExtrinsicMetadataIndexer,
+)
+from swh.indexer.storage.model import (
+    ContentMetadataRow,
+    DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
+)
 from swh.indexer.tests.utils import DIRECTORY2
-from swh.model.model import Directory, DirectoryEntry
+from swh.model.model import (
+    Directory,
+    DirectoryEntry,
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    RawExtrinsicMetadata,
+)
+from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
 
 from .utils import (
     BASE_TEST_CONFIG,
@@ -36,6 +57,24 @@
     "tools": TRANSLATOR_TOOL,
 }
 
+REMD = RawExtrinsicMetadata(
+    target=ExtendedSWHID(
+        object_type=ExtendedObjectType.ORIGIN,
+        object_id=b"\x01" * 20,
+    ),
+    discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
+    authority=MetadataAuthority(
+        type=MetadataAuthorityType.FORGE,
+        url="https://example.org/",
+    ),
+    fetcher=MetadataFetcher(
+        name="example-fetcher",
+        version="1.0.0",
+    ),
+    format="application/vnd.github.v3+json",
+    metadata=b'{"full_name": "test software"}',
+)
+
 
 class TestMetadata:
     """
@@ -141,3 +180,91 @@
             del result.tool["id"]
 
         assert results == expected_results
+
+    def test_extrinsic_metadata_indexer_unknown_format(self, mocker):
+        """Should be ignored when unknown format"""
+        metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+        metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+        remd = attr.evolve(REMD, format="unknown format")
+
+        results = metadata_indexer.index(remd.id, data=remd)
+
+        assert metadata_indexer.storage.method_calls == []
+        assert results == []
+
+    def test_extrinsic_metadata_indexer_github(self, mocker):
+        """Nominal case, calling the mapping and storing the result"""
+        origin = "https://example.org/jdoe/myrepo"
+
+        metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+        metadata_indexer.catch_exceptions = False
+        metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+        metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+        tool = metadata_indexer.idx_storage.indexer_configuration_get(
+            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+        )
+        assert tool is not None
+
+        assert metadata_indexer.process_journal_objects(
+            {"raw_extrinsic_metadata": [REMD.to_dict()]}
+        ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1}
+
+        assert metadata_indexer.storage.method_calls == [
+            call.origin_get_by_sha1([b"\x01" * 20])
+        ]
+
+        results = list(
+            metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin])
+        )
+        assert results == [
+            OriginExtrinsicMetadataRow(
+                id="https://example.org/jdoe/myrepo",
+                tool={"id": tool["id"], **TRANSLATOR_TOOL},
+                metadata={
+                    "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                    "type": "https://forgefed.org/ns#Repository",
+                    "name": "test software",
+                },
+                from_remd_id=REMD.id,
+                mappings=["GitHubMapping"],
+            )
+        ]
+
+    def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker):
+        """Early abort on non-forge authorities"""
+        metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+        metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+
+        remd = attr.evolve(
+            REMD,
+            authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY),
+        )
+
+        results = metadata_indexer.index(remd.id, data=remd)
+
+        assert metadata_indexer.storage.method_calls == []
+        assert results == []
+
+    def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker):
+        """Should be ignored when authority URL does not match the origin"""
+
+        origin = "https://different-domain.example.org/jdoe/myrepo"
+
+        metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG)
+        metadata_indexer.catch_exceptions = False
+        metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage")
+        metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}]
+
+        tool = metadata_indexer.idx_storage.indexer_configuration_get(
+            {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
+        )
+        assert tool is not None
+
+        results = metadata_indexer.index(REMD.id, data=REMD)
+
+        assert metadata_indexer.storage.method_calls == [
+            call.origin_get_by_sha1([b"\x01" * 20])
+        ]
+        assert results == []
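
Usage sketch (not part of the patch above): the new ExtrinsicMetadataIndexer is meant to be fed raw_extrinsic_metadata journal messages through process_journal_objects(), as test_extrinsic_metadata_indexer_github does. The helper name and the shape of the config dict below are assumptions based on the *_METADATA_CONFIG fixtures, not definitions from this diff.

    # Hypothetical driver; config is assumed to carry the usual BaseIndexer
    # keys (indexer storage, storage, objstorage, tools), as in the test config.
    from typing import Any, Dict, List

    from swh.indexer.metadata import ExtrinsicMetadataIndexer

    def index_remd_batch(config: Dict[str, Any], remd_dicts: List[Dict]) -> Dict:
        """Index a batch of raw_extrinsic_metadata journal messages."""
        indexer = ExtrinsicMetadataIndexer(config=config)
        # Translates each message, persists origin_extrinsic_metadata rows, and
        # returns a summary such as
        # {"status": "eventful", "origin_extrinsic_metadata:add": 1}.
        return indexer.process_journal_objects({"raw_extrinsic_metadata": remd_dicts})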
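
For reference, the third-party check exercised by test_extrinsic_metadata_indexer_thirdparty_authority reduces to comparing the network locations of the authority URL and the origin URL. The helper below restates that comparison for clarity; same_forge is an illustrative name, not a function from the patch.

    from urllib.parse import urlparse

    def same_forge(authority_url: str, origin_url: str) -> bool:
        # index() only trusts metadata when the authority and the origin share
        # a netloc; otherwise it returns [] without translating anything.
        return urlparse(authority_url).netloc == urlparse(origin_url).netloc

    assert same_forge("https://example.org/", "https://example.org/jdoe/myrepo")
    assert not same_forge(
        "https://example.org/", "https://different-domain.example.org/jdoe/myrepo"
    )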
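
The ruby.py hunk shows the intrinsic-only side of the mapping contract: GemspecMapping advertises no extrinsic formats, so ExtrinsicMetadataIndexer skips it. A mapping that does handle extrinsic metadata declares its format strings in extrinsic_metadata_formats() and turns raw bytes into a metadata dict in translate(). The toy class below only illustrates that contract under those assumptions; it is not how the real GitHub mapping is implemented.

    import json
    from typing import Dict, Optional, Tuple

    class ToyForgeMapping:
        """Illustrative mapping following the contract used by
        ExtrinsicMetadataIndexer: declare formats, translate bytes to a dict."""

        @classmethod
        def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
            return ("application/vnd.github.v3+json",)

        def translate(self, raw_content: bytes) -> Optional[Dict]:
            try:
                doc = json.loads(raw_content)
            except ValueError:
                return None
            name = doc.get("full_name")
            return {"name": name} if name is not None else None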