Patch summary (free-form text before the first "diff --git" line is ignored
by git apply / patch):
  * process_journal_objects now dispatches "origin_visit_status" messages to
    process_origin_visit_statuses instead of process_origin_visits.
  * process_origin_visit_statuses only forwards statuses whose "status" is
    "full", and skips the search.origin_update call when none remain.
  * Adds test_journal_client_origin_visit_status_from_journal covering both
    the "full" and the filtered-out "partial" case.
NOTE(review): blank context lines were reconstructed to match the hunk line
counts (77/79 and 69/94); verify whitespace against the target tree.

diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
index 5a7d793..236f428 100644
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -1,77 +1,79 @@
 # Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 
 EXPECTED_MESSAGE_TYPES = {
     "origin",
     "origin_visit",
     "origin_visit_status",
     "origin_intrinsic_metadata",
 }
 
 
 def process_journal_objects(messages, *, search):
     """Worker function for `JournalClient.process(worker_fn)`, after
     currification of `scheduler` and `task_names`."""
     assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages)
 
     if "origin" in messages:
         process_origins(messages["origin"], search)
 
     if "origin_visit" in messages:
         process_origin_visits(messages["origin_visit"], search)
 
     if "origin_visit_status" in messages:
-        process_origin_visits(messages["origin_visit_status"], search)
+        process_origin_visit_statuses(messages["origin_visit_status"], search)
 
     if "origin_intrinsic_metadata" in messages:
         process_origin_intrinsic_metadata(messages["origin_intrinsic_metadata"], search)
 
 
 def process_origins(origins, search):
     logging.debug("processing origins %r", origins)
 
     search.origin_update(origins)
 
 
 def process_origin_visits(visits, search):
     logging.debug("processing origin visits %r", visits)
 
     search.origin_update(
         [
             {
                 "url": (
                     visit["origin"]
                     if isinstance(visit["origin"], str)
                     else visit["origin"]["url"]
                 ),
                 "has_visits": True,
             }
             for visit in visits
         ]
     )
 
 
 def process_origin_visit_statuses(visit_statuses, search):
     logging.debug("processing origin visit statuses %r", visit_statuses)
 
-    search.origin_update(
-        [
-            {"url": (visit_status["origin"]), "has_visits": True,}
-            for visit_status in visit_statuses
-        ]
-    )
+    full_visit_status = [
+        {"url": (visit_status["origin"]), "has_visits": True,}
+        for visit_status in visit_statuses
+        if visit_status["status"] == "full"
+    ]
+
+    if full_visit_status:
+        search.origin_update(full_visit_status)
 
 
 def process_origin_intrinsic_metadata(origin_metadata, search):
     logging.debug("processing origin intrinsic_metadata %r", origin_metadata)
 
     origin_metadata = [
         {"url": item["id"], "intrinsic_metadata": item["metadata"],}
         for item in origin_metadata
     ]
 
     search.origin_update(origin_metadata)
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
index 3272c20..75d09fc 100644
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -1,69 +1,94 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import functools
 from unittest.mock import MagicMock
 
 from swh.search.journal_client import process_journal_objects
 
 
 def test_journal_client_origin_from_journal():
     search_mock = MagicMock()
 
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
 
     worker_fn({"origin": [{"url": "http://foobar.baz"},]})
     search_mock.origin_update.assert_called_once_with(
         [{"url": "http://foobar.baz"},]
     )
 
     search_mock.reset_mock()
 
     worker_fn({"origin": [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]})
     search_mock.origin_update.assert_called_once_with(
         [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]
     )
 
 
 def test_journal_client_origin_visit_from_journal():
     search_mock = MagicMock()
 
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
 
     worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]})
     search_mock.origin_update.assert_called_once_with(
         [{"url": "http://foobar.baz", "has_visits": True},]
     )
 
 
+def test_journal_client_origin_visit_status_from_journal():
+    search_mock = MagicMock()
+
+    worker_fn = functools.partial(process_journal_objects, search=search_mock,)
+
+    worker_fn(
+        {
+            "origin_visit_status": [
+                {"origin": "http://foobar.baz", "status": "full"}  # full visits ok
+            ]
+        }
+    )
+    search_mock.origin_update.assert_called_once_with(
+        [{"url": "http://foobar.baz", "has_visits": True},]
+    )
+
+    search_mock.reset_mock()
+
+    # non-full visits are filtered out
+    worker_fn(
+        {"origin_visit_status": [{"origin": "http://foobar.baz", "status": "partial"}]}
+    )
+    search_mock.origin_update.assert_not_called()
+
+
 def test_journal_client_origin_metadata_from_journal():
     search_mock = MagicMock()
 
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
 
     worker_fn(
         {
             "origin_intrinsic_metadata": [
                 {
                     "id": "http://foobar.baz",
                     "metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar",
                     },
                 },
             ]
         }
     )
     search_mock.origin_update.assert_called_once_with(
         [
             {
                 "url": "http://foobar.baz",
                 "intrinsic_metadata": {
                     "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                     "description": "foo bar",
                 },
             },
         ]
     )