diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py index b0a3ec2..8592077 100644 --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -1,43 +1,58 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -MAX_ORIGINS_PER_TASK = 100 - -EXPECTED_MESSAGE_TYPES = {'origin', 'origin_intrinsic_metadata'} +EXPECTED_MESSAGE_TYPES = { + 'origin', 'origin_visit', 'origin_intrinsic_metadata', +} def process_journal_objects(messages, *, search): """Worker function for `JournalClient.process(worker_fn)`, after currification of `scheduler` and `task_names`.""" assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages) if 'origin' in messages: process_origins(messages['origin'], search) + if 'origin_visit' in messages: + process_origin_visits(messages['origin_visit'], search) + if 'origin_intrinsic_metadata' in messages: process_origin_intrinsic_metadata( messages['origin_intrinsic_metadata'], search) def process_origins(origins, search): logging.debug('processing origins %r', origins) search.origin_update(origins) +def process_origin_visits(visits, search): + logging.debug('processing origin visits %r', visits) + + search.origin_update([ + { + 'url': visit['origin']['url'], + 'has_visits': True + } + for visit in visits + ]) + + def process_origin_intrinsic_metadata(origin_metadata, search): logging.debug('processing origin intrinsic_metadata %r', origin_metadata) origin_metadata = [ { 'url': item['origin_url'], 'intrinsic_metadata': item['metadata'], } for item in origin_metadata] search.origin_update(origin_metadata) diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py index c056605..dcb4566 100644 --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -1,65 +1,82 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import unittest from unittest.mock import MagicMock from swh.search.journal_client import process_journal_objects class SearchJournalClientTest(unittest.TestCase): def test_origin_from_journal(self): search_mock = MagicMock() worker_fn = functools.partial( process_journal_objects, search=search_mock, ) worker_fn({'origin': [ {'url': 'http://foobar.baz'}, ]}) search_mock.origin_update.assert_called_once_with([ {'url': 'http://foobar.baz'}, ]) search_mock.reset_mock() worker_fn({'origin': [ {'url': 'http://foobar.baz'}, {'url': 'http://barbaz.qux'}, ]}) search_mock.origin_update.assert_called_once_with([ {'url': 'http://foobar.baz'}, {'url': 'http://barbaz.qux'}, ]) + def test_origin_visit_from_journal(self): + search_mock = MagicMock() + + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) + + worker_fn({'origin_visit': [ + { + 'origin': {'url': 'http://foobar.baz'}, + } + ]}) + search_mock.origin_update.assert_called_once_with([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + def test_origin_metadata_from_journal(self): search_mock = MagicMock() worker_fn = functools.partial( process_journal_objects, search=search_mock, ) worker_fn({'origin_intrinsic_metadata': [ { 'origin_url': 'http://foobar.baz', 'metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'description': 'foo bar', }, }, ]}) search_mock.origin_update.assert_called_once_with([ { 'url': 'http://foobar.baz', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'description': 'foo bar', }, }, ])