diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
 # Add here internal Software Heritage dependencies, one per line.
-swh.core[http]
+swh.core[http] >= 0.2.0
 swh.journal >= 0.1.0
 swh.model
diff --git a/swh/search/api/client.py b/swh/search/api/client.py
--- a/swh/search/api/client.py
+++ b/swh/search/api/client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -6,9 +6,13 @@
 from swh.core.api import RPCClient
 
 from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
 
 
 class RemoteSearch(RPCClient):
     """Proxy to a remote search API"""
 
+    extra_type_decoders = DECODERS
+    extra_type_encoders = ENCODERS
+
     backend_class = ElasticSearch
diff --git a/swh/search/api/serializers.py b/swh/search/api/serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/search/api/serializers.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Extra decoder(s)/encoder(s) for swh-model objects."""
+
+from typing import Callable, Dict, List, Tuple
+
+import swh.model.model as model
+
+
+def _encode_model_object(obj):
+    d = obj.to_dict()
+    d["__type__"] = type(obj).__name__
+    return d
+
+
+def _decode_model_object(d):
+    return getattr(model, d.pop("__type__")).from_dict(d)
+
+
+ENCODERS: List[Tuple[type, str, Callable]] = [
+    (model.BaseModel, "model", _encode_model_object),
+]
+
+
+DECODERS: Dict[str, Callable] = {
+    "model": _decode_model_object,
+}
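Reviewer note (illustrative, not part of the patch): the new ENCODERS/DECODERS pairs tag every swh.model object with a "model" type key so the RPC layer can rebuild it on the other side. A minimal round trip, mirroring test_serializers.py further down, would look like this (the URL is an arbitrary example):

    from swh.model.model import Origin
    from swh.search.api.serializers import _decode_model_object, _encode_model_object

    origin = Origin(url="https://example.org/repo.git")  # illustrative URL
    encoded = _encode_model_object(origin)  # plain dict plus "__type__": "Origin"
    assert _decode_model_object(encoded) == origin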
diff --git a/swh/search/api/server.py b/swh/search/api/server.py
--- a/swh/search/api/server.py
+++ b/swh/search/api/server.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@
 
 from .. import get_search
 from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
 
 
 def _get_search():
@@ -21,8 +22,14 @@
 
     return search
 
 
-app = RPCServerApp(__name__, backend_class=ElasticSearch, backend_factory=_get_search)
+class SearchServerApp(RPCServerApp):
+    extra_type_decoders = DECODERS
+    extra_type_encoders = ENCODERS
+
+app = SearchServerApp(
+    __name__, backend_class=ElasticSearch, backend_factory=_get_search
+)
 
 
 search = None
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,19 +1,23 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import base64
+
+import msgpack
+
 from typing import Any, Iterable, Dict, List, Iterator, Optional
 
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
-import msgpack
 
 from swh.core.api import remote_api_endpoint
 from swh.model import model
 from swh.model.identifiers import origin_identifier
 
+from swh.search.interface import PagedResult
+
 
 def _sanitize_origin(origin):
     origin = origin.copy()
@@ -106,31 +110,27 @@
     def origin_search(
         self,
         *,
-        url_pattern: str = None,
+        url_pattern: Optional[str] = None,
         metadata_pattern: str = None,
         with_visit: bool = False,
-        page_token: str = None,
-        count: int = 50,
-    ) -> Dict[str, object]:
+        page_token: Optional[bytes] = None,
+        limit: int = 50,
+    ) -> PagedResult[model.Origin]:
         """Searches for origins matching the `url_pattern`.
 
         Args:
-            url_pattern (str): Part of thr URL to search for
-            with_visit (bool): Whether origins with no visit are to be
-                filtered out
-            page_token (str): Opaque value used for pagination.
-            count (int): number of results to return.
+            url_pattern: Part of the URL to search for
+            with_visit: Whether origins with no visit are to be
+                filtered out
+            page_token: Opaque value used for pagination
+            limit: number of results to return
 
         Returns:
-            a dictionary with keys:
-            * `next_page_token`:
-              opaque value used for fetching more results. `None` if there
-              are no more result.
-            * `results`:
-              list of dictionaries with key:
-              * `url`: URL of a matching origin
+            PagedResult of Origin matching the search criteria. If next_page_token is
+            None, there is no more data to retrieve.
+
         """
-        query_clauses = []  # type: List[Dict[str, Any]]
+        query_clauses: List[Dict[str, Any]] = []
 
         if url_pattern:
             query_clauses.append(
@@ -169,12 +169,13 @@
                 "At least one of url_pattern and metadata_pattern must be provided."
             )
 
+        next_page_token = None
+
         if with_visit:
             query_clauses.append({"term": {"has_visits": True,}})
 
         body = {
             "query": {"bool": {"must": query_clauses,}},
-            "size": count,
             "sort": [{"_score": "desc"}, {"sha1": "asc"},],
         }
         if page_token:
@@ -185,29 +186,21 @@
                 page_token_content[b"sha1"].decode("ascii"),
             ]
 
-        res = self._backend.search(index="origin", body=body, size=count,)
+        res = self._backend.search(index="origin", body=body, size=limit)
 
         hits = res["hits"]["hits"]
 
-        if len(hits) == count:
+        if len(hits) == limit:
             last_hit = hits[-1]
             next_page_token_content = {
                 b"score": last_hit["_score"],
                 b"sha1": last_hit["_source"]["sha1"],
             }
-            next_page_token = base64.b64encode(
-                msgpack.dumps(next_page_token_content)
-            )  # type: Optional[bytes]
-        else:
-            next_page_token = None
-
-        return {
-            "next_page_token": next_page_token,
-            "results": [
-                {
-                    # TODO: also add 'id'?
-                    "url": hit["_source"]["url"],
-                }
-                for hit in hits
-            ],
-        }
+            next_page_token = base64.b64encode(msgpack.dumps(next_page_token_content))
+
+        assert len(hits) <= limit
+
+        return PagedResult(
+            results=[model.Origin(url=hit["_source"]["url"]) for hit in hits],
+            next_page_token=next_page_token,
+        )
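Reviewer note (illustrative, not part of the patch): both backends now build the opaque page_token the same way, a msgpack-encoded mapping wrapped in base64, though with different keys (score/sha1 above, start_at_index in the in-memory backend below). A sketch of that round trip, with an invented token value:

    import base64

    import msgpack

    token_content = {b"start_at_index": 100}  # illustrative content
    page_token = base64.b64encode(msgpack.dumps(token_content))
    assert msgpack.loads(base64.b64decode(page_token)) == token_content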
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,18 +1,22 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import base64
-from collections import defaultdict
 import itertools
 import re
+
+from collections import defaultdict
 from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 import msgpack
 
 from swh.core.api import remote_api_endpoint
 from swh.model.identifiers import origin_identifier
+from swh.model.model import Origin
+
+from swh.search.interface import PagedResult
 
 
 def _sanitize_origin(origin):
@@ -61,15 +65,15 @@
     def origin_search(
         self,
         *,
-        url_pattern: str = None,
-        metadata_pattern: str = None,
+        url_pattern: Optional[str] = None,
+        metadata_pattern: Optional[str] = None,
         with_visit: bool = False,
-        page_token: str = None,
-        count: int = 50,
-    ) -> Dict[str, object]:
-        matches = (
+        page_token: Optional[bytes] = None,
+        limit: int = 50,
+    ) -> PagedResult[Origin]:
+        matches: Iterator[Dict[str, Any]] = (
             self._origins[id_] for id_ in self._origin_ids
-        )  # type: Iterator[Dict[str, Any]]
+        )
 
         if url_pattern:
             tokens = set(self._url_splitter.split(url_pattern))
@@ -100,6 +104,8 @@
                 "At least one of url_pattern and metadata_pattern must be provided."
             )
 
+        next_page_token = None
+
         if with_visit:
             matches = filter(lambda o: o.get("has_visits"), matches)
 
@@ -109,19 +115,20 @@
         else:
             start_at_index = 0
 
-        hits = list(itertools.islice(matches, start_at_index, start_at_index + count))
+        origins: List[Origin] = [
+            Origin(url=hit["url"])
+            for hit in itertools.islice(
+                matches, start_at_index, start_at_index + limit + 1
+            )
+        ]
 
-        if len(hits) == count:
+        if len(origins) > limit:
             next_page_token_content = {
-                b"start_at_index": start_at_index + count,
+                b"start_at_index": start_at_index + limit,
             }
-            next_page_token = base64.b64encode(
-                msgpack.dumps(next_page_token_content)
-            )  # type: Optional[bytes]
-        else:
-            next_page_token = None
+            origins = origins[:limit]
+            next_page_token = base64.b64encode(msgpack.dumps(next_page_token_content))
+
+        assert len(origins) <= limit
 
-        return {
-            "next_page_token": next_page_token,
-            "results": [{"url": hit["url"]} for hit in hits],
-        }
+        return PagedResult(results=origins, next_page_token=next_page_token,)
diff --git a/swh/search/interface.py b/swh/search/interface.py
new file mode 100644
--- /dev/null
+++ b/swh/search/interface.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import TypeVar
+
+from swh.core.api.classes import PagedResult as CorePagedResult
+
+
+TResult = TypeVar("TResult")
+PagedResult = CorePagedResult[TResult, bytes]
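Reviewer note (illustrative, not part of the patch): with the interface module above, origin_search now returns a PagedResult[Origin] whose next_page_token is bytes. A hypothetical caller paginating by hand could look like the sketch below; the `search` argument and the helper name are assumptions standing in for any backend exposing origin_search:

    from typing import List

    from swh.model.model import Origin
    from swh.search.interface import PagedResult


    def first_two_pages(search, pattern: str) -> List[str]:
        # `search` is assumed to expose the origin_search endpoint defined above
        page: PagedResult[Origin] = search.origin_search(url_pattern=pattern, limit=10)
        urls = [origin.url for origin in page.results]
        if page.next_page_token is not None:
            page = search.origin_search(
                url_pattern=pattern, limit=10, page_token=page.next_page_token
            )
            urls.extend(origin.url for origin in page.results)
        return urls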
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -12,9 +12,11 @@
 from confluent_kafka import Producer
 from click.testing import CliRunner
 
+from swh.model.model import Origin
 from swh.journal.serializers import value_to_kafka
 
 from swh.search.cli import cli
+from swh.search.interface import PagedResult
 
 
 CLI_CONFIG = """
@@ -82,17 +84,16 @@
     swh_search.flush()
 
     # searching origin without visit as requirement
-    results = swh_search.origin_search(url_pattern="foobar")
+    actual_page = swh_search.origin_search(url_pattern="foobar")
     # We find it
-    assert results == {
-        "next_page_token": None,
-        "results": [{"url": "http://foobar.baz"}],
-    }
+    assert actual_page.next_page_token is None
+    assert actual_page.results == [Origin(url="http://foobar.baz")]
 
     # It's an origin with no visit, searching for it with visit
-    results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
     # returns nothing
-    assert results == {"next_page_token": None, "results": []}
+    assert actual_page.next_page_token is None
+    assert actual_page.results == []
 
 
 def test__journal_client__origin_visit(
@@ -128,15 +129,15 @@
 
     swh_search.flush()
 
-    expected_result = {
-        "next_page_token": None,
-        "results": [{"url": "http://baz.foobar"}],
-    }
+    expected_page = PagedResult(
+        next_page_token=None, results=[Origin(url="http://baz.foobar")],
+    )
+
     # Both search returns the visit
-    results = swh_search.origin_search(url_pattern="foobar", with_visit=False)
-    assert results == expected_result
-    results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
-    assert results == expected_result
+    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=False)
+    assert actual_page == expected_page
+    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+    assert actual_page == expected_page
 
 
 def test__journal_client__missing_main_journal_config_key(elasticsearch_host):
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,125 +7,114 @@
 
 from swh.search.utils import stream_results
 
+from swh.model.model import Origin
+
 
 class CommonSearchTest:
     def test_origin_url_unique_word_prefix(self):
-        self.search.origin_update(
-            [
-                {"url": "http://foobar.baz"},
-                {"url": "http://barbaz.qux"},
-                {"url": "http://qux.quux"},
-            ]
-        )
+        origin_foobar = Origin(url="http://foobar.baz")
+        origin_barbaz = Origin(url="http://barbaz.qux")
+        origin_quux = Origin(url="http://qux.quux")
+        origins = [origin_foobar, origin_barbaz, origin_quux]
+
+        self.search.origin_update([o.to_dict() for o in origins])
         self.search.flush()
 
-        results = self.search.origin_search(url_pattern="foobar")
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://foobar.baz"}],
-        }
+        actual_page = self.search.origin_search(url_pattern="foobar")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_foobar]
 
-        results = self.search.origin_search(url_pattern="barb")
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://barbaz.qux"}],
-        }
+        actual_page = self.search.origin_search(url_pattern="barb")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_barbaz]
 
         # 'bar' is part of 'foobar', but is not the beginning of it
 
-        results = self.search.origin_search(url_pattern="bar")
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://barbaz.qux"}],
-        }
-
-        results = self.search.origin_search(url_pattern="barbaz")
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://barbaz.qux"}],
-        }
+        actual_page = self.search.origin_search(url_pattern="bar")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_barbaz]
 
-    def test_origin_url_unique_word_prefix_multiple_results(self):
-        self.search.origin_update(
-            [
-                {"url": "http://foobar.baz"},
-                {"url": "http://barbaz.qux"},
-                {"url": "http://qux.quux"},
-            ]
-        )
-        self.search.flush()
+        actual_page = self.search.origin_search(url_pattern="barbaz")
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_barbaz]
 
-        results = self.search.origin_search(url_pattern="qu")
-        assert results["next_page_token"] is None
+    def test_origin_url_unique_word_prefix_multiple_results(self):
+        origin_foobar = Origin(url="http://foobar.baz")
+        origin_barbaz = Origin(url="http://barbaz.qux")
+        origin_quux = Origin(url="http://qux.quux")
+        origins = [origin_foobar, origin_barbaz, origin_quux]
 
-        results = [res["url"] for res in results["results"]]
-        expected_results = ["http://qux.quux", "http://barbaz.qux"]
-        assert sorted(results) == sorted(expected_results)
+        self.search.origin_update([o.to_dict() for o in origins])
+        self.search.flush()
 
-        results = self.search.origin_search(url_pattern="qux")
-        assert results["next_page_token"] is None
+        actual_page = self.search.origin_search(url_pattern="qu")
+        assert actual_page.next_page_token is None
+        assert set(actual_page.results) == set([origin_quux, origin_barbaz])
 
-        results = [res["url"] for res in results["results"]]
-        expected_results = ["http://barbaz.qux", "http://qux.quux"]
-        assert sorted(results) == sorted(expected_results)
+        actual_page = self.search.origin_search(url_pattern="qux")
+        assert actual_page.next_page_token is None
+        assert set(actual_page.results) == set([origin_quux, origin_barbaz])
 
     def test_origin_url_all_terms(self):
-        self.search.origin_update(
-            [{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},]
-        )
+        origin_foobarbaz = Origin(url="http://foo.bar/baz")
+        origin_foobarfoobar = Origin(url="http://foo.bar/foo.bar")
+        origins = [origin_foobarbaz, origin_foobarfoobar]
+
+        self.search.origin_update([o.to_dict() for o in origins])
         self.search.flush()
 
         # Only results containing all terms should be returned.
-        results = self.search.origin_search(url_pattern="foo bar baz")
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://foo.bar/baz"},],
-        }
+        actual_page = self.search.origin_search(url_pattern="foo bar baz")
+        assert actual_page.next_page_token is None
+        assert set(actual_page.results) == set([origin_foobarbaz])
 
     def test_origin_with_visit(self):
+        origin_foobarbaz = Origin(url="http://foobar/baz")
+
         self.search.origin_update(
-            [{"url": "http://foobar.baz", "has_visits": True},]
+            [{**o.to_dict(), "has_visits": True} for o in [origin_foobarbaz]]
         )
         self.search.flush()
 
-        results = self.search.origin_search(url_pattern="foobar", with_visit=True)
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://foobar.baz"}],
-        }
+        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_foobarbaz]
 
     def test_origin_with_visit_added(self):
-        self.search.origin_update(
-            [{"url": "http://foobar.baz"},]
-        )
+        origin_foobarbaz = Origin(url="http://foobar.baz")
+
+        self.search.origin_update([o.to_dict() for o in [origin_foobarbaz]])
         self.search.flush()
 
-        results = self.search.origin_search(url_pattern="foobar", with_visit=True)
-        assert results == {"next_page_token": None, "results": []}
+        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+        assert actual_page.next_page_token is None
+        assert actual_page.results == []
 
         self.search.origin_update(
-            [{"url": "http://foobar.baz", "has_visits": True},]
+            [{**o.to_dict(), "has_visits": True} for o in [origin_foobarbaz]]
         )
         self.search.flush()
 
-        results = self.search.origin_search(url_pattern="foobar", with_visit=True)
-        assert results == {
-            "next_page_token": None,
-            "results": [{"url": "http://foobar.baz"}],
-        }
+        actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin_foobarbaz]
 
     def test_origin_intrinsic_metadata_description(self):
+        origin1_nothin = Origin(url="http://origin1")
+        origin2_foobar = Origin(url="http://origin2")
+        origin3_barbaz = Origin(url="http://origin3")
+
         self.search.origin_update(
             [
-                {"url": "http://origin1", "intrinsic_metadata": {},},
+                {"url": origin1_nothin.url, "intrinsic_metadata": {},},
                 {
-                    "url": "http://origin2",
+                    "url": origin2_foobar.url,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "foo bar",
                     },
                 },
                 {
-                    "url": "http://origin3",
+                    "url": origin3_barbaz.url,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "description": "bar baz",
"bar baz", @@ -135,36 +124,34 @@ ) self.search.flush() - results = self.search.origin_search(metadata_pattern="foo") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin2"}], - } + actual_page = self.search.origin_search(metadata_pattern="foo") + assert actual_page.next_page_token is None + assert actual_page.results == [origin2_foobar] - results = self.search.origin_search(metadata_pattern="foo bar") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin2"}], - } + actual_page = self.search.origin_search(metadata_pattern="foo bar") + assert actual_page.next_page_token is None + assert actual_page.results == [origin2_foobar] - results = self.search.origin_search(metadata_pattern="bar baz") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin3"}], - } + actual_page = self.search.origin_search(metadata_pattern="bar baz") + + assert actual_page.next_page_token is None + assert actual_page.results == [origin3_barbaz] def test_origin_intrinsic_metadata_all_terms(self): + origin1_foobarfoobar = Origin(url="http://origin1") + origin3_foobarbaz = Origin(url="http://origin2") + self.search.origin_update( [ { - "url": "http://origin1", + "url": origin1_foobarfoobar.url, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar foo bar", }, }, { - "url": "http://origin3", + "url": origin3_foobarbaz.url, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar baz", @@ -174,25 +161,27 @@ ) self.search.flush() - results = self.search.origin_search(metadata_pattern="foo bar baz") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin3"}], - } + actual_page = self.search.origin_search(metadata_pattern="foo bar baz") + assert actual_page.next_page_token is None + assert actual_page.results == [origin3_foobarbaz] def test_origin_intrinsic_metadata_nested(self): + origin1_nothin = Origin(url="http://origin1") + origin2_foobar = Origin(url="http://origin2") + origin3_barbaz = Origin(url="http://origin3") + self.search.origin_update( [ - {"url": "http://origin1", "intrinsic_metadata": {},}, + {"url": origin1_nothin.url, "intrinsic_metadata": {},}, { - "url": "http://origin2", + "url": origin2_foobar.url, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar"], }, }, { - "url": "http://origin3", + "url": origin3_barbaz.url, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["bar", "baz"], @@ -202,23 +191,17 @@ ) self.search.flush() - results = self.search.origin_search(metadata_pattern="foo") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin2"}], - } + actual_page = self.search.origin_search(metadata_pattern="foo") + assert actual_page.next_page_token is None + assert actual_page.results == [origin2_foobar] - results = self.search.origin_search(metadata_pattern="foo bar") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin2"}], - } + actual_page = self.search.origin_search(metadata_pattern="foo bar") + assert actual_page.next_page_token is None + assert actual_page.results == [origin2_foobar] - results = self.search.origin_search(metadata_pattern="bar baz") - assert results == { - "next_page_token": None, - "results": [{"url": "http://origin3"}], - } + actual_page = self.search.origin_search(metadata_pattern="bar baz") + 
+        assert actual_page.next_page_token is None
+        assert actual_page.results == [origin3_barbaz]
 
         # TODO: add more tests with more codemeta terms
 
@@ -226,71 +209,60 @@
 
     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
-    def test_origin_url_paging(self, count):
+    def test_origin_url_paging(self, limit):
         # TODO: no hypothesis
+        origin1_foo = Origin(url="http://origin1/foo")
+        origin2_foobar = Origin(url="http://origin2/foo/bar")
+        origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
         self.reset()
         self.search.origin_update(
-            [
-                {"url": "http://origin1/foo"},
-                {"url": "http://origin2/foo/bar"},
-                {"url": "http://origin3/foo/bar/baz"},
-            ]
+            [o.to_dict() for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]]
         )
         self.search.flush()
 
         results = stream_results(
-            self.search.origin_search, url_pattern="foo bar baz", count=count
+            self.search.origin_search, url_pattern="foo bar baz", limit=limit
        )
-        results = [res["url"] for res in results]
-        expected_results = [
-            "http://origin3/foo/bar/baz",
-        ]
-        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+        assert set(results) == set([origin3_foobarbaz])
 
         results = stream_results(
-            self.search.origin_search, url_pattern="foo bar", count=count
+            self.search.origin_search, url_pattern="foo bar", limit=limit
         )
-        expected_results = [
-            "http://origin2/foo/bar",
-            "http://origin3/foo/bar/baz",
-        ]
-        results = [res["url"] for res in results]
-        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+        assert set(results) == set([origin2_foobar, origin3_foobarbaz])
 
         results = stream_results(
-            self.search.origin_search, url_pattern="foo", count=count
+            self.search.origin_search, url_pattern="foo", limit=limit
         )
-        expected_results = [
-            "http://origin1/foo",
-            "http://origin2/foo/bar",
-            "http://origin3/foo/bar/baz",
-        ]
-        results = [res["url"] for res in results]
-        assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+        assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz])
 
     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
-    def test_origin_intrinsic_metadata_paging(self, count):
+    def test_origin_intrinsic_metadata_paging(self, limit):
         # TODO: no hypothesis
+        origin1_foo = Origin(url="http://origin1/foo")
+        origin2_foobar = Origin(url="http://origin2/foo/bar")
+        origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
         self.reset()
         self.search.origin_update(
             [
                 {
-                    "url": "http://origin1",
+                    "url": origin1_foo.url,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo"],
                     },
                 },
                 {
-                    "url": "http://origin2",
+                    "url": origin2_foobar.url,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo", "bar"],
                     },
                 },
                 {
-                    "url": "http://origin3",
+                    "url": origin3_foobarbaz.url,
                     "intrinsic_metadata": {
                         "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
                         "keywords": ["foo", "bar", "baz"],
@@ -301,20 +273,16 @@
         self.search.flush()
 
         results = stream_results(
-            self.search.origin_search, metadata_pattern="foo bar baz", count=count
+            self.search.origin_search, metadata_pattern="foo bar baz", limit=limit
         )
-        assert list(results) == [{"url": "http://origin3"}]
+        assert set(results) == set([origin3_foobarbaz])
 
         results = stream_results(
-            self.search.origin_search, metadata_pattern="foo bar", count=count
+            self.search.origin_search, metadata_pattern="foo bar", limit=limit
         )
-        assert list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}]
{"url": "http://origin3"}] + assert set(results) == set([origin2_foobar, origin3_foobarbaz]) results = stream_results( - self.search.origin_search, metadata_pattern="foo", count=count + self.search.origin_search, metadata_pattern="foo", limit=limit ) - assert list(results) == [ - {"url": "http://origin1"}, - {"url": "http://origin2"}, - {"url": "http://origin3"}, - ] + assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz]) diff --git a/swh/search/tests/test_serializers.py b/swh/search/tests/test_serializers.py new file mode 100644 --- /dev/null +++ b/swh/search/tests/test_serializers.py @@ -0,0 +1,18 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.model.model import Origin + +from swh.search.api.serializers import _encode_model_object, _decode_model_object + + +def test_serialization_back_and_forth(): + """Testing serialization on origin back and forth should be fine + + """ + origin = Origin(url="foobar") + + assert _decode_model_object(_encode_model_object(origin)) == origin diff --git a/swh/search/utils.py b/swh/search/utils.py --- a/swh/search/utils.py +++ b/swh/search/utils.py @@ -1,16 +1,19 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def stream_results(f, *args, **kwargs): + """Consume the paginated result and stream it directly + + """ if "page_token" in kwargs: raise TypeError('stream_results has no argument "page_token".') page_token = None while True: results = f(*args, page_token=page_token, **kwargs) - yield from results["results"] - page_token = results["next_page_token"] + yield from results.results + page_token = results.next_page_token if page_token is None: break