Hide blocklisted origins from origin search results.

Adds "blocklisted" to the origin fields that are kept when saving to
Elasticsearch, declares it as a boolean next to the other indexed fields,
and excludes origins whose "blocklisted" flag is true from origin_search in
both the Elasticsearch and the in-memory backends. Two tests cover the
behaviour: a blocklisted origin is never returned, and a later update that
does not mention "blocklisted" does not make the origin reappear.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. The line structure below matches the
hunk headers; the indentation was re-derived from the Python nesting and
should be confirmed against the target tree (or applied with
whitespace-insensitive matching, e.g. `patch -l`).

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -31,7 +31,12 @@
 
     # Whitelist fields to be saved in Elasticsearch
     res = {"url": origin.pop("url")}
-    for field_name in ("intrinsic_metadata", "has_visits", "visit_types"):
+    for field_name in (
+        "blocklisted",
+        "has_visits",
+        "intrinsic_metadata",
+        "visit_types",
+    ):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
 
@@ -145,6 +150,8 @@
                             }
                         },
                     },
+                    # Has this origin been taken down?
+                    "blocklisted": {"type": "boolean",},
                 },
             },
         )
@@ -275,7 +282,12 @@
             query_clauses.append({"terms": {"visit_types": visit_types}})
 
         body = {
-            "query": {"bool": {"must": query_clauses,}},
+            "query": {
+                "bool": {
+                    "must": query_clauses,
+                    "must_not": [{"term": {"blocklisted": True}}],
+                }
+            },
             "sort": [{"_score": "desc"}, {"sha1": "asc"},],
         }
         if page_token:
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -81,7 +81,9 @@
         limit: int = 50,
     ) -> PagedResult[MinimalOriginDict]:
         hits: Iterator[Dict[str, Any]] = (
-            self._origins[id_] for id_ in self._origin_ids
+            self._origins[id_]
+            for id_ in self._origin_ids
+            if not self._origins[id_].get("blocklisted")
         )
 
         if url_pattern:
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -578,3 +578,39 @@
             self.search.origin_search, metadata_pattern="foo", limit=limit
         )
         assert list(results) == [origin1_foo, origin2_foobar, origin3_foobarbaz]
+
+    def test_search_blocklisted_results(self):
+        origin1 = {"url": "http://origin1"}
+        origin2 = {"url": "http://origin2", "blocklisted": True}
+
+        self.search.origin_update([origin1, origin2])
+        self.search.flush()
+
+        actual_page = self.search.origin_search(url_pattern="origin")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin1]
+
+    def test_search_blocklisted_update(self):
+        origin1 = {"url": "http://origin1"}
+        self.search.origin_update([origin1])
+        self.search.flush()
+
+        result_page = self.search.origin_search(url_pattern="origin")
+        assert result_page.next_page_token is None
+        assert result_page.results == [origin1]
+
+        self.search.origin_update([{**origin1, "blocklisted": True}])
+        self.search.flush()
+
+        result_page = self.search.origin_search(url_pattern="origin")
+        assert result_page.next_page_token is None
+        assert result_page.results == []
+
+        self.search.origin_update(
+            [{**origin1, "has_visits": True, "visit_types": ["git"]}]
+        )
+        self.search.flush()
+
+        result_page = self.search.origin_search(url_pattern="origin")
+        assert result_page.next_page_token is None
+        assert result_page.results == []