diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from copy import deepcopy +import itertools from typing import ( Any, Callable, @@ -73,19 +74,19 @@ def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: - results = [] + results = {} for item in objects.get("raw_extrinsic_metadata", []): remd = RawExtrinsicMetadata.from_dict(item) sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid()) - results.extend(self.index(remd.id, data=remd)) + results[remd.target] = self.index(remd.id, data=remd) except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary - summary_persist = self.persist_index_computations(results) - self.results = results + self.results = list(itertools.chain.from_iterable(results.values())) + summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -281,3 +281,32 @@ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] + + def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): + """Two REMD objects for one origin in a batch store only the last one""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert 
metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + REMD.to_dict(), + {**REMD.to_dict(), "id": b"\x00" * 20}, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20