diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -15,6 +15,7 @@ "origin", "origin_visit_status", "origin_intrinsic_metadata", + "origin_extrinsic_metadata", } @@ -80,6 +81,9 @@ if "origin_intrinsic_metadata" in messages: process_origin_intrinsic_metadata(messages["origin_intrinsic_metadata"], search) + if "origin_extrinsic_metadata" in messages: + process_origin_extrinsic_metadata(messages["origin_extrinsic_metadata"], search) + def process_origins(origins, search): logging.debug("processing origins %r", origins) @@ -132,3 +136,17 @@ ] search.origin_update(origin_metadata) + + +def process_origin_extrinsic_metadata(origin_metadata, search): + logging.debug("processing origin extrinsic_metadata %r", origin_metadata) + + origin_metadata = [ + { + "url": item["id"], + "jsonld": item["metadata"], + } + for item in origin_metadata + ] + + search.origin_update(origin_metadata) diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py --- a/swh/search/tests/test_cli.py +++ b/swh/search/tests/test_cli.py @@ -10,6 +10,7 @@ from click.testing import CliRunner from confluent_kafka import Producer import pytest +from typing_extensions import Literal import yaml from swh.journal.serializers import value_to_kafka @@ -175,13 +176,18 @@ assert actual_page.results == [origin_foobar] -def test__journal_client__origin_intrinsic_metadata( - swh_search, elasticsearch_host, kafka_prefix: str, kafka_server +@pytest.mark.parametrize("metadata_source", ["intrinsic", "extrinsic"]) +def test__journal_client__origin_metadata( + swh_search, + elasticsearch_host, + kafka_prefix: str, + kafka_server, + metadata_source: Literal["intrinsic", "extrinsic"], ): """Subscribing to origin-intrinsic-metadata should result in swh-search indexation""" origin_foobar = {"url": "https://github.com/clojure/clojure"} - origin_intrinsic_metadata = { + origin_metadata = { "id": origin_foobar["url"], "metadata": { "name": "clojure", @@ -194,9 +200,27 @@ "codeRepository": "https://repo.maven.apache.org/maven2/org/clojure/clojure", # noqa }, "indexer_configuration_id": 1, - "from_revision": hash_to_bytes("f47c139e20970ee0852166f48ee2a4626632b86e"), - "mappings": ["maven"], } + if metadata_source == "intrinsic": + origin_metadata.update( + { + "from_revision": hash_to_bytes( + "f47c139e20970ee0852166f48ee2a4626632b86e" + ), + "mappings": ["maven"], + } + ) + elif metadata_source == "extrinsic": + origin_metadata.update( + { + "from_revision": hash_to_bytes( + "f47c139e20970ee0852166f48ee2a4626632b86e" + ), + "mappings": ["github"], + } + ) + else: + assert False, metadata_source producer = Producer( { @@ -205,9 +229,9 @@ "acks": "all", } ) - topic = f"{kafka_prefix}.origin_intrinsic_metadata" - value = value_to_kafka(origin_intrinsic_metadata) - producer.produce(topic=topic, key=b"bogus-origin-intrinsic-metadata", value=value) + topic = f"{kafka_prefix}.origin_{metadata_source}_metadata" + value = value_to_kafka(origin_metadata) + producer.produce(topic=topic, key=b"bogus-origin-metadata", value=value) producer.flush() journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format( @@ -221,7 +245,7 @@ "--stop-after-objects", "1", "--object-type", - "origin_intrinsic_metadata", + f"origin_{metadata_source}_metadata", ], journal_objects_config, elasticsearch_host=elasticsearch_host, diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -276,7 +276,10 @@ ) -def test_journal_client_origin_metadata_from_journal(): +@pytest.mark.parametrize( + "key", ["origin_intrinsic_metadata", "origin_extrinsic_metadata"] +) +def test_journal_client_origin_metadata_from_journal(key): search_mock = MagicMock() worker_fn = functools.partial( @@ -286,7 +289,7 @@ worker_fn( { - "origin_intrinsic_metadata": [ + key: [ { "id": "http://foobar.baz", "metadata": {