diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -36,6 +36,8 @@
         "has_visits",
         "intrinsic_metadata",
         "visit_types",
+        "nb_visit",
+        "last_visit_date",
     ):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
@@ -171,6 +173,12 @@
         update_script = """
         // backup current visit_types field value
         List visit_types = ctx._source.getOrDefault("visit_types", []);
+        int nb_visit = ctx._source.getOrDefault("nb_visit", 0);
+        // A document without last_visit_date falls back to a date older than
+        // any real visit, because parsing an empty string would throw
+        String stored_last_visit_date = ctx._source.getOrDefault(
+            "last_visit_date", "0001-01-01T00:00:00Z");
+        ZonedDateTime last_visit_date = ZonedDateTime.parse(stored_last_visit_date);
 
         // update origin document with new field values
         ctx._source.putAll(params);
@@ -183,6 +191,25 @@
                 }
             }
         }
+
+        // Undo overwrite if incoming nb_visit is smaller
+        if (ctx._source.containsKey("nb_visit")) {
+            int incoming_nb_visit = ctx._source.getOrDefault("nb_visit", 0);
+            if (incoming_nb_visit < nb_visit) {
+                ctx._source.nb_visit = nb_visit;
+            }
+        }
+
+        // Undo overwrite if incoming last_visit_date is older
+        if (ctx._source.containsKey("last_visit_date")) {
+            ZonedDateTime incoming_last_visit_date = ZonedDateTime.parse(
+                ctx._source.last_visit_date);
+            // dates are not numeric: compare with isBefore, not the < operator
+            if (incoming_last_visit_date.isBefore(last_visit_date)) {
+                // restore the stored string rather than the parsed object
+                ctx._source.last_visit_date = stored_last_visit_date;
+            }
+        }
         """
 
         actions = [
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -59,7 +59,12 @@
     logging.debug("processing origin visit statuses %r", visit_statuses)
 
     full_visit_status = [
-        {"url": (visit_status["origin"]), "has_visits": True,}
+        {
+            "url": (visit_status["origin"]),
+            "has_visits": True,
+            "nb_visit": visit_status["visit"],
+            "last_visit_date": visit_status["date"].isoformat(),
+        }
         for visit_status in visit_statuses
         if visit_status["status"] == "full"
     ]
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information
 
 import copy
+from datetime import datetime, timezone
 import tempfile
 
 from click.testing import CliRunner
@@ -179,6 +180,7 @@
         {
             "origin": origin_foobar["url"],
             "visit": 1,
+            "date": datetime.now(tz=timezone.utc),
             "snapshot": None,
             "status": "full",
         }
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime
 import functools
 from unittest.mock import MagicMock
 
@@ -42,22 +43,44 @@
     search_mock = MagicMock()
 
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
+    current_datetime = datetime.now()
 
     worker_fn(
         {
             "origin_visit_status": [
-                {"origin": "http://foobar.baz", "status": "full"}  # full visits ok
+                {
+                    "origin": "http://foobar.baz",
+                    "status": "full",
+                    "visit": 5,
+                    "date": current_datetime,
+                }  # full visits ok
             ]
         }
     )
     search_mock.origin_update.assert_called_once_with(
-        [{"url": "http://foobar.baz", "has_visits": True},]
+        [
+            {
+                "url": "http://foobar.baz",
+                "has_visits": True,
+                "nb_visit": 5,
+                "last_visit_date": current_datetime.isoformat(),
+            },
+        ]
     )
 
     search_mock.reset_mock()
 
     # non-full visits are filtered out
     worker_fn(
-        {"origin_visit_status": [{"origin": "http://foobar.baz", "status": "partial"}]}
+        {
+            "origin_visit_status": [
+                {
+                    "origin": "http://foobar.baz",
+                    "status": "partial",
+                    "visit": 5,
+                    "date": current_datetime,
+                }
+            ]
+        }
     )
     search_mock.origin_update.assert_not_called()