diff --git a/config/dev.yml b/config/dev.yml
--- a/config/dev.yml
+++ b/config/dev.yml
@@ -2,6 +2,10 @@
   cls: remote
   url: http://moma.internal.softwareheritage.org:5002
 
+search:
+  cls: remote
+  url: http://moma.internal.softwareheritage.org:5010
+
 debug: yes
 
 server-type: asgi
diff --git a/config/staging.yml b/config/staging.yml
--- a/config/staging.yml
+++ b/config/staging.yml
@@ -2,6 +2,10 @@
   cls: remote
   url: http://webapp.internal.staging.swh.network:5002
 
+search:
+  cls: remote
+  url: http://webapp.internal.staging.swh.network:5010
+
 debug: yes
 
 server-type: wsgi
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -2,3 +2,4 @@
 swh.core[http] >= 0.3 # [http] is required by swh.core.pytest_plugin
 swh.storage
 swh.model
+swh.search
diff --git a/swh/graphql/backends/archive.py b/swh/graphql/backends/archive.py
--- a/swh/graphql/backends/archive.py
+++ b/swh/graphql/backends/archive.py
@@ -14,15 +14,8 @@
     def get_origin(self, url):
         return self.storage.origin_get([url])[0]
 
-    def get_origins(self, after=None, first=50, url_pattern=None):
-        # STORAGE-TODO
-        # Make them a single function in the backend
-        if url_pattern is None:
-            return self.storage.origin_list(page_token=after, limit=first)
-
-        return self.storage.origin_search(
-            url_pattern=url_pattern, page_token=after, limit=first
-        )
+    def get_origins(self, after=None, first=50):
+        return self.storage.origin_list(page_token=after, limit=first)
 
     def get_origin_visits(self, origin_url, after=None, first=50):
         return self.storage.origin_visit_get(origin_url, page_token=after, limit=first)
diff --git a/swh/graphql/backends/search.py b/swh/graphql/backends/search.py
new file mode 100644
--- /dev/null
+++ b/swh/graphql/backends/search.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.graphql import server
+
+
+class Search:
+    """Thin wrapper around the search client returned by ``server.get_search()``."""
+
+    def __init__(self):
+        self.search = server.get_search()
+
+    def get_origins(self, query: str, after=None, first=50):
+        """Search origins by URL pattern; ``after``/``first`` drive the pagination."""
+        return self.search.origin_search(
+            url_pattern=query,
+            page_token=after,
+            limit=first,
+        )
diff --git a/swh/graphql/resolvers/origin.py b/swh/graphql/resolvers/origin.py
--- a/swh/graphql/resolvers/origin.py
+++ b/swh/graphql/resolvers/origin.py
@@ -3,14 +3,23 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.graphql.backends import archive
+from swh.graphql.backends import archive, search
+from swh.model.model import Origin
 from swh.storage.interface import PagedResult
 
 from .base_connection import BaseConnection
 from .base_node import BaseSWHNode
+from .search import SearchResultNode
 
 
-class OriginNode(BaseSWHNode):
+class BaseOriginNode(BaseSWHNode):
+    def is_type_of(self):
+        # is_type_of is required only when resolving a UNION type
+        # This is for ariadne to return the right type
+        return "Origin"
+
+
+class OriginNode(BaseOriginNode):
     """
     Node resolver for an origin requested directly with its URL
     """
@@ -19,16 +28,39 @@
         return archive.Archive().get_origin(self.kwargs.get("url"))
 
 
+class TargetOriginNode(BaseOriginNode):
+    """
+    Node resolver for an origin requested as a target
+    """
+
+    obj: SearchResultNode
+
+    def _get_node_data(self):
+        # The target origin URL is guaranteed to exist in the archive
+        # Hence returning the origin object without any explicit check in the archive
+        # This assumes that the search index and archive are in sync
+        return Origin(self.obj.target_url)
+
+
 class OriginConnection(BaseConnection):
     """
     Connection resolver for the origins
     """
 
-    _node_class = OriginNode
+    _node_class = BaseOriginNode
 
     def _get_paged_result(self) -> PagedResult:
+        url_pattern = self.kwargs.get("urlPattern")
+        if url_pattern:
+            # Use the search backend if a urlPattern is given
+            origins = search.Search().get_origins(
+                query=url_pattern,
+                after=self._get_after_arg(),
+                first=self._get_first_arg(),
+            )
+            results = [Origin(ori["url"]) for ori in origins.results]
+            return PagedResult(results=results, next_page_token=origins.next_page_token)
+        # Use the archive backend by default
         return archive.Archive().get_origins(
-            after=self._get_after_arg(),
-            first=self._get_first_arg(),
-            url_pattern=self.kwargs.get("urlPattern"),
+            after=self._get_after_arg(), first=self._get_first_arg()
         )
diff --git a/swh/graphql/resolvers/resolver_factory.py b/swh/graphql/resolvers/resolver_factory.py
--- a/swh/graphql/resolvers/resolver_factory.py
+++ b/swh/graphql/resolvers/resolver_factory.py
@@ -6,7 +6,7 @@
 from .content import ContentNode, HashContentNode, TargetContentNode
 from .directory import DirectoryNode, RevisionDirectoryNode, TargetDirectoryNode
 from .directory_entry import DirectoryEntryConnection
-from .origin import OriginConnection, OriginNode
+from .origin import OriginConnection, OriginNode, TargetOriginNode
 from .release import ReleaseNode, TargetReleaseNode
 from .revision import (
     LogRevisionConnection,
@@ -14,7 +14,7 @@
     RevisionNode,
     TargetRevisionNode,
 )
-from .search import ResolveSwhidConnection
+from .search import ResolveSwhidConnection, SearchConnection
 from .snapshot import (
     OriginSnapshotConnection,
     SnapshotNode,
@@ -52,6 +52,7 @@
         "content-by-hash": HashContentNode,
         "dir-entry-dir": TargetDirectoryNode,
         "dir-entry-file": TargetContentNode,
+        "search-result-origin": TargetOriginNode,
         "search-result-snapshot": TargetSnapshotNode,
         "search-result-revision": TargetRevisionNode,
         "search-result-release": TargetReleaseNode,
@@ -75,6 +76,7 @@
         "revision-log": LogRevisionConnection,
         "directory-entries": DirectoryEntryConnection,
         "resolve-swhid": ResolveSwhidConnection,
+        "search": SearchConnection,
     }
     if resolver_type not in mapping:
         raise AttributeError(f"Invalid connection type: {resolver_type}")
diff --git a/swh/graphql/resolvers/resolvers.py b/swh/graphql/resolvers/resolvers.py
--- a/swh/graphql/resolvers/resolvers.py
+++ b/swh/graphql/resolvers/resolvers.py
@@ -266,6 +266,12 @@
     return resolver(obj, info, **kw)
 
 
+@query.field("search")
+def search_resolver(obj, info: GraphQLResolveInfo, **kw) -> rs.search.SearchConnection:
+    resolver = get_connection_resolver("search")
+    return resolver(obj, info, **kw)
+
+
 # Any other type of resolver
 
 
diff --git a/swh/graphql/resolvers/search.py b/swh/graphql/resolvers/search.py
--- a/swh/graphql/resolvers/search.py
+++ b/swh/graphql/resolvers/search.py
@@ -3,7 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from swh.graphql.backends import archive
+from swh.graphql.backends import archive, search
 from swh.storage.interface import PagedResult
 
 from .base_connection import BaseConnection
@@ -29,3 +29,24 @@
                 }
             ]
         return PagedResult(results=results)
+
+
+class SearchConnection(BaseConnection):
+    """
+    Connection resolver for the search results
+    """
+
+    _node_class = SearchResultNode
+
+    def _get_paged_result(self) -> PagedResult:
+        origins = search.Search().get_origins(
+            query=self.kwargs.get("query"),
+            after=self._get_after_arg(),
+            first=self._get_first_arg(),
+        )
+
+        # FIXME hard coding type to origin for now, as it is the only searchable object
+        results = [
+            {"target_url": ori["url"], "type": "origin"} for ori in origins.results
+        ]
+        return PagedResult(results=results, next_page_token=origins.next_page_token)
diff --git a/swh/graphql/schema/schema.graphql b/swh/graphql/schema/schema.graphql
--- a/swh/graphql/schema/schema.graphql
+++ b/swh/graphql/schema/schema.graphql
@@ -1040,4 +1040,24 @@
     """
     swhid: SWHID!
   ): SearchResultConnection!
+
+  """
+  Search in SWH
+  """
+  search(
+    """
+    String to search for
+    """
+    query: String!
+
+    """
+    Returns the first _n_ elements from the list
+    """
+    first: Int!
+
+    """
+    Returns the page after the cursor
+    """
+    after: String
+  ): SearchResultConnection!
 }
diff --git a/swh/graphql/server.py b/swh/graphql/server.py
--- a/swh/graphql/server.py
+++ b/swh/graphql/server.py
@@ -7,19 +7,29 @@
 from typing import Any, Dict, Optional
 
 from swh.core import config
-from swh.storage import get_storage as get_swhstorage
+from swh.search import get_search as get_swh_search
+from swh.storage import get_storage as get_swh_storage
 
 graphql_cfg = None
 storage = None
+search = None
 
 
 def get_storage():
     global storage
     if not storage:
-        storage = get_swhstorage(**graphql_cfg["storage"])
+        storage = get_swh_storage(**graphql_cfg["storage"])
     return storage
 
 
+def get_search():
+    """Return the module-level search client, built from the config on first use."""
+    global search
+    if not search:
+        search = get_swh_search(**graphql_cfg["search"])
+    return search
+
+
 def load_and_check_config(config_path: Optional[str]) -> Dict[str, Any]:
     """Check the minimal configuration is set to run the api or raise an
     error explanation.
diff --git a/swh/graphql/tests/conftest.py b/swh/graphql/tests/conftest.py
--- a/swh/graphql/tests/conftest.py
+++ b/swh/graphql/tests/conftest.py
@@ -10,14 +10,15 @@
 
 from swh.graphql import server as app_server
 from swh.graphql.app import schema
-from swh.storage import get_storage as get_swhstorage
+from swh.search import get_search as get_swh_search
+from swh.storage import get_storage as get_swh_storage
 
-from .data import populate_dummy_data
+from .data import populate_dummy_data, populate_search_data
 
 
 @pytest.fixture
 def storage():
-    storage = get_swhstorage(cls="memory")
+    storage = get_swh_storage(cls="memory")
     # set the global var to use the in-memory storage
     app_server.storage = storage
     # populate the in-memory storage
@@ -26,7 +27,18 @@
 
 
 @pytest.fixture
-def test_app(storage):
+def search():
+    search = get_swh_search(cls="memory")
+    # set the global var to use the in-memory search
+    app_server.search = search
+    search.initialize()
+    # populate the in-memory search
+    populate_search_data(search)
+    return search
+
+
+@pytest.fixture
+def test_app(storage, search):
     app = Flask(__name__)
 
     @app.route("/", methods=["POST"])
diff --git a/swh/graphql/tests/data.py b/swh/graphql/tests/data.py
--- a/swh/graphql/tests/data.py
+++ b/swh/graphql/tests/data.py
@@ -12,6 +12,11 @@
         method(objects)
 
 
+def populate_search_data(search):
+    """Feed the URL of every test origin to ``search.origin_update``."""
+    search.origin_update({"url": origin.url} for origin in get_origins())
+
+
 def get_origins():
     return swh_model_data.ORIGINS
 
diff --git a/swh/graphql/tests/functional/test_search.py b/swh/graphql/tests/functional/test_search.py
new file mode 100644
--- /dev/null
+++ b/swh/graphql/tests/functional/test_search.py
@@ -0,0 +1,64 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from . import utils
+
+
+def test_search_origins(client):
+    query_str = """
+    {
+      search(query: "fox", first: 1) {
+        nodes {
+          type
+          target {
+            ...on Origin {
+              url
+              latestVisit {
+                date
+              }
+            }
+          }
+        }
+        pageInfo {
+          hasNextPage
+          endCursor
+        }
+      }
+    }
+    """
+    data, _ = utils.get_query_response(client, query_str)
+    assert len(data["search"]["nodes"]) == 1
+    assert data == {
+        "search": {
+            "nodes": [
+                {
+                    "target": {
+                        "url": "https://somewhere.org/den/fox",
+                        "latestVisit": {"date": "2018-11-27T17:20:39+00:00"},
+                    },
+                    "type": "origin",
+                }
+            ],
+            "pageInfo": {"endCursor": "MQ==", "hasNextPage": True},
+        }
+    }
+
+
+def test_search_missing_url(client):
+    query_str = """
+    {
+      search(query: "missing-fox", first: 1) {
+        nodes {
+          type
+        }
+        pageInfo {
+          hasNextPage
+          endCursor
+        }
+      }
+    }
+    """
+    data, _ = utils.get_query_response(client, query_str)
+    assert len(data["search"]["nodes"]) == 0