diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -21,7 +21,7 @@
 
     # Whitelist fields to be saved in Elasticsearch
     res = {"url": origin.pop("url")}
-    for field_name in ("intrinsic_metadata", "has_visits"):
+    for field_name in ("intrinsic_metadata", "has_visits", "visit_types"):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
 
@@ -103,6 +103,7 @@
                         }
                     },
                 },
+                "visit_types": {"type": "keyword"},
                 # used to filter out origins that were never visited
                 "has_visits": {"type": "boolean",},
                 "intrinsic_metadata": {
@@ -129,13 +130,34 @@
         documents_with_sha1 = (
             (origin_identifier(document), document) for document in documents
         )
+        # painless script that will be executed when updating an origin document
+        update_script = """
+            // backup current visit_types field value
+            List visit_types = ctx._source.getOrDefault("visit_types", []);
+            // update origin document with new field values
+            ctx._source.putAll(params);
+            // restore previous visit types after visit_types field overriding
+            if (ctx._source.containsKey("visit_types")) {
+                for (int i = 0; i < visit_types.length; ++i) {
+                    if (!ctx._source.visit_types.contains(visit_types[i])) {
+                        ctx._source.visit_types.add(visit_types[i]);
+                    }
+                }
+            }
+        """
+
         actions = [
             {
                 "_op_type": "update",
                 "_id": sha1,
                 "_index": self.origin_index,
-                "doc": {**document, "sha1": sha1,},
-                "doc_as_upsert": True,
+                "scripted_upsert": True,
+                "upsert": {**document, "sha1": sha1,},
+                "script": {
+                    "source": update_script,
+                    "lang": "painless",
+                    "params": document,
+                },
             }
             for (sha1, document) in documents_with_sha1
         ]
@@ -164,6 +186,7 @@
         url_pattern: Optional[str] = None,
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
+        visit_types: Optional[List[str]] = None,
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[Dict[str, Any]]:
@@ -218,6 +241,9 @@
         if with_visit:
             query_clauses.append({"term": {"has_visits": True,}})
 
+        if visit_types is not None:
+            query_clauses.append({"terms": {"visit_types": visit_types}})
+
         body = {
             "query": {"bool": {"must": query_clauses,}},
             "sort": [{"_score": "desc"}, {"sha1": "asc"},],
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -59,7 +59,12 @@
             id_ = origin_identifier(document)
             if "url" in document:
                 document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
+            if "visit_types" in document:
+                document["visit_types"] = set(document["visit_types"])
+                if "visit_types" in self._origins[id_]:
+                    document["visit_types"].update(self._origins[id_]["visit_types"])
             self._origins[id_].update(document)
+
             if id_ not in self._origin_ids:
                 self._origin_ids.append(id_)
@@ -69,6 +74,7 @@
         url_pattern: Optional[str] = None,
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
+        visit_types: Optional[List[str]] = None,
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[Dict[str, Any]]:
@@ -120,6 +126,13 @@
         if with_visit:
             hits = filter(lambda o: o.get("has_visits"), hits)
 
+        if visit_types is not None:
+            visit_types_set = set(visit_types)
+            hits = filter(
+                lambda o: visit_types_set.intersection(o.get("visit_types", set())),
+                hits,
+            )
+
         start_at_index = int(page_token) if page_token else 0
 
         origins = [
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from typing import Any, Dict, Iterable, Optional, TypeVar
+from typing import Any, Dict, Iterable, List, Optional, TypeVar
 
 from swh.core.api import remote_api_endpoint
 from swh.core.api.classes import PagedResult as CorePagedResult
@@ -42,6 +42,7 @@
         url_pattern: Optional[str] = None,
         metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
+        visit_types: Optional[List[str]] = None,
         page_token: Optional[str] = None,
         limit: int = 50,
     ) -> PagedResult[Dict[str, Any]]:
@@ -51,6 +52,8 @@
 
             url_pattern: Part of the URL to search for
             with_visit: Whether origins with no visit are to be filtered out
+            visit_types: Only origins having any of the provided visit types
+                (e.g. git, svn, pypi) will be returned
             page_token: Opaque value used for pagination
             limit: number of results to return
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -48,6 +48,7 @@
                 if isinstance(visit["origin"], str)
                 else visit["origin"]["url"]
             ),
+            "visit_types": [visit["type"]],
         }
         for visit in visits
     ]
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -19,10 +19,9 @@
 CLI_CONFIG = """
 search:
     cls: elasticsearch
-    args:
-        hosts:
-        - '%(elasticsearch_host)s'
-        index_prefix: test
+    hosts:
+    - '%(elasticsearch_host)s'
+    index_prefix: test
 """
 
 JOURNAL_OBJECTS_CONFIG_TEMPLATE = """
@@ -120,7 +119,7 @@
         }
     )
     topic = f"{kafka_prefix}.origin_visit"
-    value = value_to_kafka({"origin": origin_foobar["url"]})
+    value = value_to_kafka({"origin": origin_foobar["url"], "type": "git"})
     producer.produce(topic=topic, key=b"bogus-origin-visit", value=value)
 
     journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -32,9 +32,9 @@
     worker_fn = functools.partial(process_journal_objects, search=search_mock,)
 
-    worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]})
+    worker_fn({"origin_visit": [{"origin": "http://foobar.baz", "type": "git"},]})
 
     search_mock.origin_update.assert_called_once_with(
-        [{"url": "http://foobar.baz"},]
+        [{"url": "http://foobar.baz", "visit_types": ["git"]},]
     )
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -101,6 +101,89 @@
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin_foobar_baz]
 
+    def test_origin_no_visit_types_search(self):
+        origins = [{"url": "http://foobar.baz"}]
+
+        self.search.origin_update(origins)
+        self.search.flush()
+
+        actual_page = self.search.origin_search(url_pattern="http", visit_types=["git"])
+        assert actual_page.next_page_token is None
+        results = [r["url"] for r in actual_page.results]
+        expected_results = []
+        assert sorted(results) == sorted(expected_results)
+
+        actual_page = self.search.origin_search(url_pattern="http", visit_types=None)
+        assert actual_page.next_page_token is None
+        results = [r["url"] for r in actual_page.results]
+        expected_results = [origin["url"] for origin in origins]
+        assert sorted(results) == sorted(expected_results)
+
+    def test_origin_visit_types_search(self):
+        origins = [
+            {"url": "http://foobar.baz", "visit_types": ["git"]},
+            {"url": "http://barbaz.qux", "visit_types": ["svn"]},
+            {"url": "http://qux.quux", "visit_types": ["hg"]},
+        ]
+
+        self.search.origin_update(origins)
+        self.search.flush()
+
+        for origin in origins:
+            actual_page = self.search.origin_search(
+                url_pattern="http", visit_types=origin["visit_types"]
+            )
+            assert actual_page.next_page_token is None
+            results = [r["url"] for r in actual_page.results]
+            expected_results = [origin["url"]]
+            assert sorted(results) == sorted(expected_results)
+
+        actual_page = self.search.origin_search(url_pattern="http", visit_types=None)
+        assert actual_page.next_page_token is None
+        results = [r["url"] for r in actual_page.results]
+        expected_results = [origin["url"] for origin in origins]
+        assert sorted(results) == sorted(expected_results)
+
+    def test_origin_visit_types_update_search(self):
+        origin_url = "http://foobar.baz"
+        self.search.origin_update([{"url": origin_url}])
+        self.search.flush()
+
+        def _update_and_check_visit_types(new_visit_type, visit_types_list):
+            self.search.origin_update(
+                [{"url": origin_url, "visit_types": [new_visit_type]}]
+            )
+            self.search.flush()
+
+            for visit_types in visit_types_list:
+                actual_page = self.search.origin_search(
+                    url_pattern="http", visit_types=visit_types
+                )
+                assert actual_page.next_page_token is None
+                results = [r["url"] for r in actual_page.results]
+                expected_results = [origin_url]
+                assert sorted(results) == sorted(expected_results)
+
+        _update_and_check_visit_types(
+            new_visit_type="git", visit_types_list=[["git"], ["git", "hg"]]
+        )
+        _update_and_check_visit_types(
+            new_visit_type="svn",
+            visit_types_list=[["git"], ["svn"], ["svn", "git"], ["git", "hg", "svn"]],
+        )
+        _update_and_check_visit_types(
+            new_visit_type="hg",
+            visit_types_list=[
+                ["git"],
+                ["svn"],
+                ["hg"],
+                ["svn", "git"],
+                ["hg", "git"],
+                ["hg", "svn"],
+                ["git", "hg", "svn"],
+            ],
+        )
+
     def test_origin_intrinsic_metadata_description(self):
         origin1_nothin = {"url": "http://origin1"}
         origin2_foobar = {"url": "http://origin2"}
@@ -362,6 +445,32 @@
         assert actual_page.next_page_token is None
         assert actual_page.results == [origin2]
 
+    def test_origin_intrinsic_metadata_update(self):
+        origin = {"url": "http://origin1"}
+        origin_data = {
+            **origin,
+            "intrinsic_metadata": {
+                "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+                "author": "John Doe",
+            },
+        }
+
+        self.search.origin_update([origin_data])
+        self.search.flush()
+
+        actual_page = self.search.origin_search(metadata_pattern="John")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin]
+
+        origin_data["intrinsic_metadata"]["author"] = "Jane Doe"
+
+        self.search.origin_update([origin_data])
+        self.search.flush()
+
+        actual_page = self.search.origin_search(metadata_pattern="Jane")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin]
+
     # TODO: add more tests with more codemeta terms
 
     # TODO: add more tests with edge cases