# Patch summary:
#   * swh/indexer/cli.py: accept "extrinsic-metadata" as an indexer name for
#     `swh indexer journal-client`; when selected (or with "*"), subscribe to
#     "raw_extrinsic_metadata" journal objects and feed them to
#     ExtrinsicMetadataIndexer.process_journal_objects.
#   * swh/indexer/tests/test_cli.py: rename the existing journal-client test to
#     ..._origin_intrinsic_metadata and add an extrinsic-metadata counterpart.
#
# NOTE(review): this patch had been flattened (newlines and indentation lost).
# Line breaks below are reconstructed so that every hunk matches its
# "@@ -a,b +c,d @@" counts; leading whitespace is inferred from Python syntax.
# Confirm context-line indentation against the target tree, or apply with
# `git apply --ignore-whitespace` / `patch -l`.
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -215,7 +215,7 @@
 @indexer_cli_group.command("journal-client")
 @click.argument(
     "indexer",
-    type=click.Choice(["origin-intrinsic-metadata", "*"]),
+    type=click.Choice(["origin-intrinsic-metadata", "extrinsic-metadata", "*"]),
     required=False
     # TODO: remove required=False after we stop using it
 )
@@ -262,7 +262,7 @@
     import functools
     import warnings
 
-    from swh.indexer.indexer import ObjectsDict
+    from swh.indexer.indexer import BaseIndexer, ObjectsDict
     from swh.indexer.journal_client import process_journal_objects
     from swh.journal.client import get_journal_client
     from swh.scheduler import get_scheduler
@@ -303,6 +303,8 @@
             )
         )
 
+    idx: Optional[BaseIndexer] = None
+
     if indexer in ("origin-intrinsic-metadata", "*"):
         from swh.indexer.metadata import OriginMetadataIndexer
 
@@ -311,6 +313,14 @@
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
+    if indexer in ("extrinsic-metadata", "*"):
+        from swh.indexer.metadata import ExtrinsicMetadataIndexer
+
+        object_types.add("raw_extrinsic_metadata")
+        idx = ExtrinsicMetadataIndexer()
+        idx.catch_exceptions = False  # don't commit offsets if indexation failed
+        worker_fns.append(idx.process_journal_objects)
+
     if not worker_fns:
         raise click.ClickException(f"Unknown indexer: {indexer}")
 
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -9,6 +9,7 @@
 from typing import Any, Dict, List
 from unittest.mock import patch
 
+import attr
 from click.testing import CliRunner
 from confluent_kafka import Consumer
 import pytest
@@ -17,12 +18,14 @@
 from swh.indexer.storage.interface import IndexerStorageInterface
 from swh.indexer.storage.model import (
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.journal.writer import get_journal_writer
 from swh.model.hashutil import hash_to_bytes
-from swh.model.model import OriginVisitStatus
+from swh.model.model import Origin, OriginVisitStatus
 
+from .test_metadata import REMD
 from .utils import DIRECTORY2, REVISION
 
 
@@ -531,7 +534,7 @@
 
 
 @pytest.mark.parametrize("indexer_name", ["origin-intrinsic-metadata", "*"])
-def test_cli_journal_client_index(
+def test_cli_journal_client_index__origin_intrinsic_metadata(
     cli_runner,
     swh_config,
     kafka_prefix: str,
@@ -656,3 +659,75 @@
         for status in sorted(visit_statuses_full, key=lambda r: r.origin)
     ]
     assert sorted(results, key=lambda r: r.id) == expected_results
+
+
+@pytest.mark.parametrize("indexer_name", ["extrinsic-metadata", "*"])
+def test_cli_journal_client_index__origin_extrinsic_metadata(
+    cli_runner,
+    swh_config,
+    kafka_prefix: str,
+    kafka_server,
+    consumer: Consumer,
+    idx_storage,
+    storage,
+    mocker,
+    swh_indexer_config,
+    indexer_name: str,
+):
+    """Test 'swh indexer journal-client' indexing of raw extrinsic metadata."""
+    journal_writer = get_journal_writer(
+        "kafka",
+        brokers=[kafka_server],
+        prefix=kafka_prefix,
+        client_id="test producer",
+        value_sanitizer=lambda object_type, value: value,
+        flush_timeout=3,  # fail early if something is going wrong
+    )
+
+    origin = Origin("http://example.org/repo.git")
+    storage.origin_add([origin])
+    raw_extrinsic_metadata = attr.evolve(REMD, target=origin.swhid())
+    raw_extrinsic_metadata = attr.evolve(
+        raw_extrinsic_metadata, id=raw_extrinsic_metadata.compute_hash()
+    )
+    journal_writer.write_additions("raw_extrinsic_metadata", [raw_extrinsic_metadata])
+
+    result = cli_runner.invoke(
+        indexer_cli_group,
+        [
+            "-C",
+            swh_config,
+            "journal-client",
+            indexer_name,
+            "--broker",
+            kafka_server,
+            "--prefix",
+            kafka_prefix,
+            "--group-id",
+            "test-consumer",
+            "--stop-after-objects",
+            1,
+        ],
+        catch_exceptions=False,
+    )
+
+    # Check the output
+    expected_output = "Done.\n"
+    assert result.exit_code == 0, result.output
+    assert result.output == expected_output
+
+    results = idx_storage.origin_extrinsic_metadata_get([origin.url])
+    expected_results = [
+        OriginExtrinsicMetadataRow(
+            id=origin.url,
+            from_remd_id=raw_extrinsic_metadata.id,
+            tool={"id": 1, **swh_indexer_config["tools"]},
+            mappings=["github"],
+            metadata={
+                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                "type": "https://forgefed.org/ns#Repository",
+                "name": "test software",
+            },
+        )
+    ]
+    assert sorted(results, key=lambda r: r.id) == expected_results