diff --git a/PKG-INFO b/PKG-INFO index 722f6b7..6c7950e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,46 +1,46 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.3.3 +Version: 0.3.4 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Description: swh-search ========== Search service for the Software Heritage archive. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) # Dependencies Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). ## Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) ## Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.search.egg-info/PKG-INFO b/swh.search.egg-info/PKG-INFO index 722f6b7..6c7950e 100644 --- a/swh.search.egg-info/PKG-INFO +++ b/swh.search.egg-info/PKG-INFO @@ -1,46 +1,46 @@ Metadata-Version: 2.1 Name: swh.search -Version: 0.3.3 +Version: 0.3.4 Summary: Software Heritage search service Home-page: https://forge.softwareheritage.org/diffusion/DSEA Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-search Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-search/ Description: swh-search ========== Search service for the Software Heritage archive. Currently uses ElasticSearch, and provides only origin search (by URL and metadata) # Dependencies Python tests for this module include tests that cannot be run without a local ElasticSearch instance, so you need the ElasticSearch server executable on your machine (no need to have a running ElasticSearch server). ## Debian-like host The elasticsearch package is required. As it's not part of debian-stable, [another debian repository is required to be configured](https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html#deb-repo) ## Non Debian-like host The tests expect: - `/usr/share/elasticsearch/jdk/bin/java` to exist. - `org.elasticsearch.bootstrap.Elasticsearch` to be in java's classpath. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py index 5a7d793..236f428 100644 --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -1,77 +1,79 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging EXPECTED_MESSAGE_TYPES = { "origin", "origin_visit", "origin_visit_status", "origin_intrinsic_metadata", } def process_journal_objects(messages, *, search): """Worker function for `JournalClient.process(worker_fn)`, after currification of `scheduler` and `task_names`.""" assert set(messages) <= EXPECTED_MESSAGE_TYPES, set(messages) if "origin" in messages: process_origins(messages["origin"], search) if "origin_visit" in messages: process_origin_visits(messages["origin_visit"], search) if "origin_visit_status" in messages: - process_origin_visits(messages["origin_visit_status"], search) + process_origin_visit_statuses(messages["origin_visit_status"], search) if "origin_intrinsic_metadata" in messages: process_origin_intrinsic_metadata(messages["origin_intrinsic_metadata"], search) def process_origins(origins, search): logging.debug("processing origins %r", origins) search.origin_update(origins) def process_origin_visits(visits, search): logging.debug("processing origin visits %r", visits) search.origin_update( [ { "url": ( visit["origin"] if isinstance(visit["origin"], str) else visit["origin"]["url"] ), "has_visits": True, } for visit in visits ] ) def process_origin_visit_statuses(visit_statuses, search): logging.debug("processing origin visit statuses %r", visit_statuses) - search.origin_update( - [ - {"url": (visit_status["origin"]), "has_visits": True,} - for visit_status in visit_statuses - ] - ) + full_visit_status = [ + {"url": (visit_status["origin"]), "has_visits": True,} + for visit_status in visit_statuses + if visit_status["status"] == "full" + ] + + if full_visit_status: + search.origin_update(full_visit_status) def process_origin_intrinsic_metadata(origin_metadata, search): logging.debug("processing origin intrinsic_metadata %r", origin_metadata) origin_metadata = [ {"url": item["id"], "intrinsic_metadata": item["metadata"],} for item in origin_metadata ] search.origin_update(origin_metadata) diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py index 3272c20..75d09fc 100644 --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -1,69 +1,94 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools from unittest.mock import MagicMock from swh.search.journal_client import process_journal_objects def test_journal_client_origin_from_journal(): search_mock = MagicMock() worker_fn = functools.partial(process_journal_objects, search=search_mock,) worker_fn({"origin": [{"url": "http://foobar.baz"},]}) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz"},] ) search_mock.reset_mock() worker_fn({"origin": [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},]}) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz"}, {"url": "http://barbaz.qux"},] ) def test_journal_client_origin_visit_from_journal(): search_mock = MagicMock() worker_fn = functools.partial(process_journal_objects, search=search_mock,) worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]}) search_mock.origin_update.assert_called_once_with( [{"url": "http://foobar.baz", "has_visits": True},] ) +def test_journal_client_origin_visit_status_from_journal(): + search_mock = MagicMock() + + worker_fn = functools.partial(process_journal_objects, search=search_mock,) + + worker_fn( + { + "origin_visit_status": [ + {"origin": "http://foobar.baz", "status": "full"} # full visits ok + ] + } + ) + search_mock.origin_update.assert_called_once_with( + [{"url": "http://foobar.baz", "has_visits": True},] + ) + + search_mock.reset_mock() + + # non-full visits are filtered out + worker_fn( + {"origin_visit_status": [{"origin": "http://foobar.baz", "status": "partial"}]} + ) + search_mock.origin_update.assert_not_called() + + def test_journal_client_origin_metadata_from_journal(): search_mock = MagicMock() worker_fn = functools.partial(process_journal_objects, search=search_mock,) worker_fn( { "origin_intrinsic_metadata": [ { "id": "http://foobar.baz", "metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, }, ] } ) search_mock.origin_update.assert_called_once_with( [ { "url": "http://foobar.baz", "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, }, ] )