diff --git a/swh/web/api/tests/views/test_origin.py b/swh/web/api/tests/views/test_origin.py
--- a/swh/web/api/tests/views/test_origin.py
+++ b/swh/web/api/tests/views/test_origin.py
@@ -1,9 +1,10 @@
-# Copyright (C) 2015-2021 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from datetime import timedelta
+import itertools
 import json
 
 from hypothesis import given
@@ -763,6 +764,41 @@
     assert expected_data == rv.data
 
 
+@pytest.mark.parametrize(
+    "backend,fields",
+    itertools.product(
+        ["swh-search", "swh-indexer-storage"],
+        ["url", "url,foobar", "url,metadata_foo"],
+    ),
+)
+def test_api_origin_metadata_search_url_only(api_client, mocker, backend, fields):
+    """Checks that idx_storage.origin_intrinsic_metadata_get is not called when
+    its results are not needed"""
+    mocker.patch(
+        "swh.web.utils.archive.idx_storage.origin_intrinsic_metadata_get",
+        side_effect=AssertionError("origin_intrinsic_metadata_get was called"),
+    )
+
+    mock_config = mocker.patch("swh.web.utils.archive.config")
+    mock_config.get_config.return_value = {
+        "search_config": {"metadata_backend": backend}
+    }
+
+    url = reverse(
+        "api-1-origin-metadata-search",
+        query_params={"fulltext": ORIGIN_METADATA_VALUE, "fields": fields},
+    )
+    rv = check_api_get_responses(api_client, url, status_code=200)
+    rv.data = sorted(rv.data, key=lambda d: d["url"])
+
+    expected_data = sorted(
+        [{"url": origin_url} for origin_url in sorted(ORIGIN_MASTER_REVISION.keys())],
+        key=lambda d: d["url"],
+    )
+
+    assert expected_data == rv.data
+
+
 def test_api_origin_metadata_search_limit(api_client, mocker):
     mock_idx_storage = mocker.patch("swh.web.utils.archive.idx_storage")
     oimsft = mock_idx_storage.origin_intrinsic_metadata_search_fulltext
diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py
--- a/swh/web/api/views/origin.py
+++ b/swh/web/api/views/origin.py
@@ -285,12 +285,20 @@
     """
     fulltext = request.query_params.get("fulltext", None)
     limit = min(int(request.query_params.get("limit", "70")), 100)
+    fields = request.query_params.get("fields")
     if not fulltext:
         content = '"fulltext" must be provided and non-empty.'
         raise BadInputExc(content)
 
+    # "fields" is a comma-separated list: match whole names, not substrings
+    return_metadata = not fields or "metadata" in fields.split(",")
+
     results = api_lookup(
-        archive.search_origin_metadata, fulltext, limit, request=request
+        archive.search_origin_metadata,
+        fulltext,
+        limit,
+        return_metadata,
+        request=request,
     )
 
     return {
diff --git a/swh/web/utils/archive.py b/swh/web/utils/archive.py
--- a/swh/web/utils/archive.py
+++ b/swh/web/utils/archive.py
@@ -336,13 +336,14 @@
 
 
 def search_origin_metadata(
-    fulltext: str, limit: int = 50
+    fulltext: str, limit: int = 50, return_metadata: bool = True
 ) -> Iterable[OriginMetadataInfo]:
     """Search for origins whose metadata match a provided string pattern.
 
     Args:
         fulltext: the string pattern to search for in origin metadata
         limit: the maximum number of found origins to return
+        return_metadata: if false, will only return the origin URL
 
     Returns:
         Iterable of origin metadata information for existing origins
@@ -358,9 +359,14 @@
             limit=limit,
         )
         origin_urls = [r["url"] for r in page_result.results]
-        metadata = {
-            r.id: r for r in idx_storage.origin_intrinsic_metadata_get(origin_urls)
-        }
+
+        if return_metadata:
+            metadata = {
+                r.id: r for r in idx_storage.origin_intrinsic_metadata_get(origin_urls)
+            }
+        else:
+            # Skip query to swh-indexer if we are not interested in the results
+            metadata = {}
 
         # Results from swh-search are not guaranteed to be in
         # idx_storage.origin_intrinsic_metadata (typically when they come from
@@ -392,6 +398,8 @@
             if field in match:
                 match[field] = hashutil.hash_to_hex(match[field])
         del match["id"]
+        if not return_metadata:
+            match.pop("metadata", None)
         results.append(OriginMetadataInfo(url=origin.url, metadata=match))
 
     return results