diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py new file mode 100644 index 0000000..b0a3ec2 --- /dev/null +++ b/swh/search/journal_client.py @@ -0,0 +1,43 @@ +# Copyright (C) 2018-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging + + +MAX_ORIGINS_PER_TASK = 100 + +EXPECTED_MESSAGE_TYPES = {'origin', 'origin_intrinsic_metadata'} + + +def process_journal_objects(messages, *, search): + """Worker function for `JournalClient.process(worker_fn)`, after + currification of `scheduler` and `task_names`.""" + assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages) + + if 'origin' in messages: + process_origins(messages['origin'], search) + + if 'origin_intrinsic_metadata' in messages: + process_origin_intrinsic_metadata( + messages['origin_intrinsic_metadata'], search) + + +def process_origins(origins, search): + logging.debug('processing origins %r', origins) + + search.origin_update(origins) + + +def process_origin_intrinsic_metadata(origin_metadata, search): + logging.debug('processing origin intrinsic_metadata %r', origin_metadata) + + origin_metadata = [ + { + 'url': item['origin_url'], + 'intrinsic_metadata': item['metadata'], + } + for item in origin_metadata] + + search.origin_update(origin_metadata) diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py new file mode 100644 index 0000000..c056605 --- /dev/null +++ b/swh/search/tests/test_journal_client.py @@ -0,0 +1,65 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import functools +import unittest +from unittest.mock import MagicMock + +from swh.search.journal_client import process_journal_objects + + +class SearchJournalClientTest(unittest.TestCase): + def test_origin_from_journal(self): + search_mock = MagicMock() + + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) + + worker_fn({'origin': [ + {'url': 'http://foobar.baz'}, + ]}) + search_mock.origin_update.assert_called_once_with([ + {'url': 'http://foobar.baz'}, + ]) + + search_mock.reset_mock() + + worker_fn({'origin': [ + {'url': 'http://foobar.baz'}, + {'url': 'http://barbaz.qux'}, + ]}) + search_mock.origin_update.assert_called_once_with([ + {'url': 'http://foobar.baz'}, + {'url': 'http://barbaz.qux'}, + ]) + + def test_origin_metadata_from_journal(self): + search_mock = MagicMock() + + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) + + worker_fn({'origin_intrinsic_metadata': [ + { + 'origin_url': 'http://foobar.baz', + 'metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'description': 'foo bar', + }, + }, + ]}) + search_mock.origin_update.assert_called_once_with([ + { + 'url': 'http://foobar.baz', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'description': 'foo bar', + }, + }, + ])