diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -41,6 +41,7 @@
         "snapshot_id",
         "last_visit_date",
         "last_eventful_visit_date",
+        "last_revision_date",
     ):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
@@ -149,6 +150,8 @@
                     "snapshot_id": {"type": "keyword"},
                     "last_visit_date": {"type": "date"},
                     "last_eventful_visit_date": {"type": "date"},
+                    "last_release_date": {"type": "date"},
+                    "last_revision_date": {"type": "date"},
                     "intrinsic_metadata": {
                         "type": "nested",
                         "properties": {
@@ -185,6 +188,7 @@
             ZonedDateTime last_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_visit_date", "0001-01-01T00:00:00Z"));
             String snapshot_id = ctx._source.getOrDefault("snapshot_id", "");
             ZonedDateTime last_eventful_visit_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_eventful_visit_date", "0001-01-01T00:00:00Z"));
+            ZonedDateTime last_revision_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_revision_date", "0001-01-01T00:00:00Z"));

             // update origin document with new field values
             ctx._source.putAll(params);
@@ -226,6 +230,15 @@
                     ctx._source.last_eventful_visit_date = last_eventful_visit_date;
                 }
             }
+
+            // Undo overwrite if incoming last_revision_date is older
+            if (ctx._source.containsKey("last_revision_date")) {
+                ZonedDateTime incoming_last_revision_date = ZonedDateTime.parse(ctx._source.getOrDefault("last_revision_date", "0001-01-01T00:00:00Z"));
+                int difference = incoming_last_revision_date.compareTo(last_revision_date); // negative, zero or positive
+                if (difference < 0) {
+                    ctx._source.last_revision_date = last_revision_date;
+                }
+            }
             """  # noqa
         )

@@ -271,6 +284,7 @@
         min_nb_visits: int = 0,
         min_last_visit_date: str = "",
         min_last_eventful_visit_date: str = "",
+        min_last_revision_date: str = "",
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
@@ -346,6 +360,16 @@
                     }
                 }
             )
+        if min_last_revision_date:
+            query_clauses.append(
+                {
+                    "range": {
+                        "last_revision_date": {
+                            "gte": min_last_revision_date.replace("Z", "+00:00"),
+                        }
+                    }
+                }
+            )
         if visit_types is not None:
             query_clauses.append({"terms": {"visit_types": visit_types}})
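The elasticsearch.py changes above thread the new field through the document sanitizer, the index mapping, the painless update script, and the query builder. As a minimal sketch of how a caller would exercise the new filter (the cluster address, origin URL and dates below are illustrative placeholders, not part of this diff):

    from swh.search import get_search

    # Assumes a local test cluster; host and origin values are made up.
    search = get_search(cls="elasticsearch", hosts=["localhost:9200"])
    search.initialize()  # creates the index with the new date mappings

    search.origin_update(
        [
            {
                "url": "https://example.org/repo",
                "last_revision_date": "2021-03-01T00:00:00+00:00",
            }
        ]
    )
    search.flush()

    # Matches only origins whose last_revision_date is on or after the bound;
    # a trailing "Z" is normalized to "+00:00" before reaching the range query.
    page = search.origin_search(
        url_pattern="example.org", min_last_revision_date="2021-01-01T00:00:00Z"
    )
    assert [r["url"] for r in page.results] == ["https://example.org/repo"]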
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -100,6 +100,28 @@
                         document["snapshot_id"] = current_snapshot_id
                         document["last_eventful_visit_date"] = current_date.isoformat()

+            if "last_revision_date" in document:
+                document["last_revision_date"] = max(
+                    datetime.fromisoformat(
+                        document["last_revision_date"].replace("Z", "+00:00")
+                    ),
+                    datetime.fromisoformat(
+                        self._origins[id_]
+                        .get("last_revision_date", "0001-01-01T00:00:00Z")
+                        .replace("Z", "+00:00")
+                    ),
+                ).isoformat()
+            if "last_release_date" in document:
+                document["last_release_date"] = max(
+                    datetime.fromisoformat(
+                        document["last_release_date"].replace("Z", "+00:00")
+                    ),
+                    datetime.fromisoformat(
+                        self._origins[id_]
+                        .get("last_release_date", "0001-01-01T00:00:00Z")
+                        .replace("Z", "+00:00")
+                    ),
+                ).isoformat()
             self._origins[id_].update(document)

             if id_ not in self._origin_ids:
@@ -116,6 +138,7 @@
         min_nb_visits: int = 0,
         min_last_visit_date: str = "",
         min_last_eventful_visit_date: str = "",
+        min_last_revision_date: str = "",
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         hits: Iterator[Dict[str, Any]] = (
@@ -191,6 +214,17 @@
                 hits,
             )

+        if min_last_revision_date:
+            hits = filter(
+                lambda o: datetime.fromisoformat(
+                    o.get("last_revision_date", "0001-01-01T00:00:00Z").replace(
+                        "Z", "+00:00"
+                    )
+                )
+                >= datetime.fromisoformat(min_last_revision_date),
+                hits,
+            )
+
         if visit_types is not None:
             visit_types_set = set(visit_types)
             hits = filter(
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -62,6 +62,7 @@
         min_nb_visits: int = 0,
         min_last_visit_date: str = "",
         min_last_eventful_visit_date: str = "",
+        min_last_revision_date: str = "",
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         """Searches for origins matching the `url_pattern`.
@@ -80,6 +81,8 @@
            min_last_eventful_visit_date: Filter origins that have
                last_eventful_visit_date (eventful = snapshot_id changed)
                on or after the provided date(ISO format)
+           min_last_revision_date: Filter origins that have
+               last_revision_date on or after the provided date (ISO format)
            limit: number of results to return

        Returns:
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -3,7 +3,12 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+from datetime import datetime, timezone
 import logging
+from typing import List
+
+from swh.model.model import TargetType
+from swh.storage import get_storage

 EXPECTED_MESSAGE_TYPES = {
     "origin",
@@ -12,6 +17,13 @@
     "origin_intrinsic_metadata",
 }

+DEFAULT_STORAGE_CONFIG = {
+    "url": "http://127.0.0.1:5002/",
+    "timeout": 10,
+}
+
+swh_storage = get_storage(cls="remote", **DEFAULT_STORAGE_CONFIG)
+

 def process_journal_objects(messages, *, search):
     """Worker function for `JournalClient.process(worker_fn)`, after
@@ -58,6 +70,55 @@
 def process_origin_visit_statuses(visit_statuses, search):
     logging.debug("processing origin visit statuses %r", visit_statuses)

+    def convert_to_datetime(timestamps) -> List[datetime]:
+        # timestamps are swh.model TimestampWithTimezone objects (or None
+        # for undated releases); keep the resulting datetimes timezone-aware.
+        return [
+            datetime.fromtimestamp(t.timestamp.seconds, tz=timezone.utc)
+            for t in timestamps
+            if t is not None
+        ]
+
+    def fetch_last_revision_date(snapshot_id):
+        if snapshot_id:
+            snapshot = swh_storage.snapshot_get_branches(snapshot_id)
+            branches = snapshot["branches"] if snapshot else {}
+            tip_revision_ids = [
+                branch.target
+                for branch in branches.values()
+                if branch is not None and branch.target_type == TargetType.REVISION
+            ]
+            revision_dates = [
+                revision.date
+                for revision in swh_storage.revision_get(tip_revision_ids)
+                if revision is not None
+            ]
+            revision_datetimes = convert_to_datetime(revision_dates)
+            if revision_datetimes:
+                return max(revision_datetimes).isoformat()

+        return "0001-01-01T00:00:00Z"
+
+    def fetch_last_release_date(snapshot_id):
+        if snapshot_id:
+            snapshot = swh_storage.snapshot_get_branches(snapshot_id)
+            branches = snapshot["branches"] if snapshot else {}
+            tip_release_ids = [
+                branch.target
+                for branch in branches.values()
+                if branch is not None and branch.target_type == TargetType.RELEASE
+            ]
+            release_dates = [
+                release.date
+                for release in swh_storage.release_get(tip_release_ids)
+                if release is not None
+            ]
+            release_datetimes = convert_to_datetime(release_dates)
+            if release_datetimes:
+                return max(release_datetimes).isoformat()
+
+        return "0001-01-01T00:00:00Z"
+
     full_visit_status = [
         {
             "url": (visit_status["origin"]),
@@ -66,6 +127,12 @@
             "snapshot_id": visit_status.get("snapshot"),
             "last_visit_date": visit_status["date"].isoformat(),
             "last_eventful_visit_date": visit_status["date"].isoformat(),
+            "last_revision_date": fetch_last_revision_date(
+                snapshot_id=visit_status.get("snapshot")
+            ),
+            "last_release_date": fetch_last_release_date(
+                snapshot_id=visit_status.get("snapshot")
+            ),
         }
         for visit_status in visit_statuses
         if visit_status["status"] == "full"
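With the helpers above in place, the journal client side can be wired up roughly as follows. This is a sketch under assumptions: the broker address, group id and object types are placeholders, and in production this wiring is done by the `swh search journal-client objects` CLI from its configuration file rather than by hand:

    import functools

    from swh.journal.client import get_journal_client
    from swh.search import get_search
    from swh.search.journal_client import process_journal_objects

    # Placeholder broker/group values for illustration only.
    search = get_search(cls="memory")
    client = get_journal_client(
        cls="kafka",
        brokers=["localhost:9092"],
        group_id="swh.search.journal_client",
        object_types=["origin", "origin_visit_status"],
    )

    # Each origin_visit_status batch triggers the snapshot/revision/release
    # lookups above before the documents reach search.origin_update().
    worker_fn = functools.partial(process_journal_objects, search=search)
    client.process(worker_fn)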
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -67,6 +67,8 @@
             "snapshot_id": None,
             "last_visit_date": current_datetime.isoformat(),
             "last_eventful_visit_date": current_datetime.isoformat(),
+            "last_revision_date": "0001-01-01T00:00:00Z",
+            "last_release_date": "0001-01-01T00:00:00Z",
         },
     ]
 )
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -361,6 +361,46 @@
         _check_min_last_eventful_visit_date(now_plus_5_hours)  # Works for =
         _check_min_last_eventful_visit_date(now)  # Works for <

+    def test_origin_last_revision_date_update_search(self):
+        origin_url = "http://foobar.baz"
+        self.search.origin_update([{"url": origin_url}])
+        self.search.flush()
+
+        def _update_last_revision_date(last_revision_date):
+            self.search.origin_update(
+                [{"url": origin_url, "last_revision_date": last_revision_date}]
+            )
+            self.search.flush()
+
+        def _check_min_last_revision_date(min_last_revision_date):
+            actual_page = self.search.origin_search(
+                url_pattern=origin_url, min_last_revision_date=min_last_revision_date
+            )
+            assert actual_page.next_page_token is None
+            results = [r["url"] for r in actual_page.results]
+            expected_results = [origin_url]
+            assert sorted(results) == sorted(expected_results)
+
+        now = datetime.now(tz=timezone.utc).isoformat()
+        now_minus_5_hours = (
+            datetime.now(tz=timezone.utc) - timedelta(hours=5)
+        ).isoformat()
+        now_plus_5_hours = (
+            datetime.now(tz=timezone.utc) + timedelta(hours=5)
+        ).isoformat()
+
+        _update_last_revision_date(now)
+
+        _check_min_last_revision_date(now)
+        _check_min_last_revision_date(now_minus_5_hours)
+        with pytest.raises(AssertionError):
+            _check_min_last_revision_date(now_plus_5_hours)
+
+        _update_last_revision_date(now_plus_5_hours)
+
+        _check_min_last_revision_date(now_plus_5_hours)
+        _check_min_last_revision_date(now)
+
     def test_origin_update_with_no_visit_types(self):
         """
         Update an origin with visit types first then with no visit types,
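For reference, the update rule that both backends now implement for last_revision_date and last_release_date boils down to the following standalone sketch; merge_last_date and _parse are hypothetical helper names, not part of this diff:

    from datetime import datetime

    EPOCH = "0001-01-01T00:00:00Z"

    def _parse(value: str) -> datetime:
        # datetime.fromisoformat() rejects a literal "Z" suffix before
        # Python 3.11, hence the replace() both backends apply.
        return datetime.fromisoformat(value.replace("Z", "+00:00"))

    def merge_last_date(incoming: str, stored: str = EPOCH) -> str:
        # Keep whichever of the stored and incoming dates is newer, which is
        # what the painless script and the in-memory max() both guarantee.
        return max(_parse(incoming), _parse(stored)).isoformat()

    assert merge_last_date("2021-04-01T00:00:00+00:00") == "2021-04-01T00:00:00+00:00"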