diff --git a/swh/web/api/tests/views/test_origin.py b/swh/web/api/tests/views/test_origin.py
--- a/swh/web/api/tests/views/test_origin.py
+++ b/swh/web/api/tests/views/test_origin.py
@@ -728,6 +728,41 @@
     assert response == expected
 
 
+def test_api_origin_metadata_search_not_in_idx_storage(api_client, mocker):
+    """Tests the origin search for results present in swh-search but not
+    returned by ``origin_intrinsic_metadata_get`` (which happens when results
+    come from extrinsic metadata): they are expected with empty metadata.
+    """
+
+    mock_idx_storage = mocker.patch("swh.web.utils.archive.idx_storage")
+    mock_idx_storage.origin_intrinsic_metadata_get.return_value = []
+    mock_idx_storage.origin_intrinsic_metadata_search_fulltext.side_effect = (
+        AssertionError("origin_intrinsic_metadata_search_fulltext was called")
+    )
+
+    mock_config = mocker.patch("swh.web.utils.archive.config")
+    mock_config.get_config.return_value = {
+        "search_config": {"metadata_backend": "swh-search"}
+    }
+
+    url = reverse(
+        "api-1-origin-metadata-search",
+        query_params={"fulltext": ORIGIN_METADATA_VALUE},
+    )
+    rv = check_api_get_responses(api_client, url, status_code=200)
+    rv.data = sorted(rv.data, key=lambda d: d["url"])
+
+    expected_data = sorted(
+        [
+            {"url": origin_url, "metadata": {}}
+            for origin_url in sorted(ORIGIN_MASTER_REVISION.keys())
+        ],
+        key=lambda d: d["url"],
+    )
+
+    assert expected_data == rv.data
+
+
 def test_api_origin_metadata_search_limit(api_client, mocker):
     mock_idx_storage = mocker.patch("swh.web.utils.archive.idx_storage")
     oimsft = mock_idx_storage.origin_intrinsic_metadata_search_fulltext
diff --git a/swh/web/utils/archive.py b/swh/web/utils/archive.py
--- a/swh/web/utils/archive.py
+++ b/swh/web/utils/archive.py
@@ -357,18 +357,35 @@
             metadata_pattern=fulltext,
             limit=limit,
         )
-        matches = idx_storage.origin_intrinsic_metadata_get(
-            [r["url"] for r in page_result.results]
-        )
+        origin_urls = [r["url"] for r in page_result.results]
+        metadata = {
+            r.id: r for r in idx_storage.origin_intrinsic_metadata_get(origin_urls)
+        }
+
+        # Results from swh-search are not guaranteed to be in
+        # idx_storage.origin_intrinsic_metadata (typically when they come from
+        # extrinsic metadata, or when the swh-indexer cache is cleared).
+        # When they are missing, the match only carries the origin URL as "id".
+        matches = [
+            metadata[url].to_dict() if url in metadata else {"id": url}
+            for url in origin_urls
+        ]
     else:
-        matches = idx_storage.origin_intrinsic_metadata_search_fulltext(
-            conjunction=[fulltext], limit=limit
-        )
+        matches = [
+            match.to_dict()
+            for match in idx_storage.origin_intrinsic_metadata_search_fulltext(
+                conjunction=[fulltext], limit=limit
+            )
+        ]
 
-    matches = [match.to_dict() for match in matches]
     origins = storage.origin_get([match["id"] for match in matches])
+
     for origin, match in zip(origins, matches):
         if not origin:
+            # Filter out origins not present in the storage, as we do not have any
+            # meaningful content to display for that origin at the moment.
+            # This may occur when the storage database we use is lagging behind
+            # swh-search.
             continue
         for field in ("from_directory", "from_revision"):
             # from_directory when using swh.indexer >= 2.0.0, from_revision otherwise