D3657: search*: Type origin_search(...) -> PagedResult[Dict]
Raw diff D3657.id12881.diff (33 KB)
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[http]
+swh.core[http] >= 0.2.0
swh.journal >= 0.1.0
swh.model
diff --git a/swh/search/api/client.py b/swh/search/api/client.py
--- a/swh/search/api/client.py
+++ b/swh/search/api/client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,9 +6,13 @@
from swh.core.api import RPCClient
from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
class RemoteSearch(RPCClient):
"""Proxy to a remote search API"""
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
backend_class = ElasticSearch
diff --git a/swh/search/api/serializers.py b/swh/search/api/serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/search/api/serializers.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Extra decoder(s)/encoder(s) for swh-model objects."""
+
+from typing import Callable, Dict, List, Tuple
+
+import swh.model.model as model
+
+
+def _encode_model_object(obj):
+    d = obj.to_dict()
+    d["__type__"] = type(obj).__name__
+    return d
+
+
+def _decode_model_object(d):
+    return getattr(model, d.pop("__type__")).from_dict(d)
+
+
+ENCODERS: List[Tuple[type, str, Callable]] = [
+    (model.BaseModel, "model", _encode_model_object),
+]
+
+
+DECODERS: Dict[str, Callable] = {
+    "model": _decode_model_object,
+}
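The new serializers module registers extra type hooks for the RPC layer: a swh.model object is flattened to its to_dict() form plus a "__type__" marker on the way out, and rebuilt by looking that class name up on swh.model.model on the way back in. A minimal round-trip sketch, assuming this diff is applied (variable names below are illustrative):

    from swh.model.model import Origin
    from swh.search.api.serializers import DECODERS, ENCODERS

    origin = Origin(url="https://example.org/repo.git")

    # Encode: a BaseModel instance becomes a plain dict tagged with its class name.
    _, tag, encode = ENCODERS[0]
    encoded = encode(origin)   # {"url": "https://example.org/repo.git", "__type__": "Origin"}

    # Decode: the tag ("model") selects the decoder, which rebuilds the object.
    decoded = DECODERS[tag](encoded)
    assert decoded == origin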
diff --git a/swh/search/api/server.py b/swh/search/api/server.py
--- a/swh/search/api/server.py
+++ b/swh/search/api/server.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -11,6 +11,7 @@
from .. import get_search
from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
def _get_search():
@@ -21,8 +22,14 @@
return search
-app = RPCServerApp(__name__, backend_class=ElasticSearch, backend_factory=_get_search)
+class SearchServerApp(RPCServerApp):
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
+app = SearchServerApp(
+ __name__, backend_class=ElasticSearch, backend_factory=_get_search
+)
search = None
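Because RemoteSearch and SearchServerApp declare the same extra_type_encoders/decoders, model.Origin objects returned by origin_search travel over the RPC link unchanged. A hedged usage sketch; the server URL and the keyword used to point RemoteSearch at it are assumptions, not part of this diff:

    from swh.search.api.client import RemoteSearch

    # Assumption: a search RPC server built from SearchServerApp is listening here.
    search = RemoteSearch(url="http://localhost:5010/")

    page = search.origin_search(url_pattern="foobar", limit=10)
    for origin in page.results:       # swh.model.model.Origin instances
        print(origin.url)
    # page.next_page_token is None when there is no further page.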
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,19 +1,21 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import base64
+import msgpack
+
from typing import Any, Iterable, Dict, List, Iterator, Optional
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan
-import msgpack
from swh.core.api import remote_api_endpoint
from swh.model import model
from swh.model.identifiers import origin_identifier
+from swh.search.interface import PagedResult
+
def _sanitize_origin(origin):
origin = origin.copy()
@@ -106,31 +108,27 @@
def origin_search(
self,
*,
- url_pattern: str = None,
+ url_pattern: Optional[str] = None,
metadata_pattern: str = None,
with_visit: bool = False,
- page_token: str = None,
- count: int = 50,
- ) -> Dict[str, object]:
+ page_token: Optional[str] = None,
+ limit: int = 50,
+ ) -> PagedResult[model.Origin]:
"""Searches for origins matching the `url_pattern`.
Args:
- url_pattern (str): Part of thr URL to search for
- with_visit (bool): Whether origins with no visit are to be
- filtered out
- page_token (str): Opaque value used for pagination.
- count (int): number of results to return.
+ url_pattern: Part of the URL to search for
+ with_visit: Whether origins with no visit are to be
+ filtered out
+ page_token: Opaque value used for pagination
+ limit: number of results to return
Returns:
- a dictionary with keys:
- * `next_page_token`:
- opaque value used for fetching more results. `None` if there
- are no more result.
- * `results`:
- list of dictionaries with key:
- * `url`: URL of a matching origin
+ PagedResult of Origin matching the search criteria. If next_page_token is
+ None, there is no more data to retrieve.
+
"""
- query_clauses = [] # type: List[Dict[str, Any]]
+ query_clauses: List[Dict[str, Any]] = []
if url_pattern:
query_clauses.append(
@@ -169,45 +167,38 @@
"At least one of url_pattern and metadata_pattern must be provided."
)
+ next_page_token = None
+
if with_visit:
query_clauses.append({"term": {"has_visits": True,}})
body = {
"query": {"bool": {"must": query_clauses,}},
- "size": count,
"sort": [{"_score": "desc"}, {"sha1": "asc"},],
}
if page_token:
# TODO: use ElasticSearch's scroll API?
- page_token_content = msgpack.loads(base64.b64decode(page_token), raw=True)
+ page_token_content = msgpack.loads(page_token, raw=True)
body["search_after"] = [
page_token_content[b"score"],
page_token_content[b"sha1"].decode("ascii"),
]
- res = self._backend.search(index="origin", body=body, size=count,)
+ res = self._backend.search(index="origin", body=body, size=limit)
hits = res["hits"]["hits"]
- if len(hits) == count:
+ if len(hits) == limit:
last_hit = hits[-1]
next_page_token_content = {
b"score": last_hit["_score"],
b"sha1": last_hit["_source"]["sha1"],
}
- next_page_token = base64.b64encode(
- msgpack.dumps(next_page_token_content)
- ) # type: Optional[bytes]
- else:
- next_page_token = None
-
- return {
- "next_page_token": next_page_token,
- "results": [
- {
- # TODO: also add 'id'?
- "url": hit["_source"]["url"],
- }
- for hit in hits
- ],
- }
+ next_page_token = msgpack.dumps(next_page_token_content)
+
+ assert len(hits) <= limit
+
+ return PagedResult(
+ results=[model.Origin(url=hit["_source"]["url"]) for hit in hits],
+ next_page_token=next_page_token,
+ )
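The Elasticsearch backend drops the base64 wrapping around its page token: the opaque token is now the raw msgpack encoding of the last hit's score and sha1, which the next call unpacks into the search_after clause. A small sketch of that round trip, with made-up hit values:

    import msgpack

    # Assumed example hit; in the backend this is the last Elasticsearch hit of a page.
    last_hit = {"_score": 7.3, "_source": {"sha1": "b5cc17d5a24e9b04"}}

    token = msgpack.dumps(
        {b"score": last_hit["_score"], b"sha1": last_hit["_source"]["sha1"]}
    )

    # Next page request: unpack the token and feed it to "search_after".
    content = msgpack.loads(token, raw=True)
    search_after = [content[b"score"], content[b"sha1"].decode("ascii")]
    assert search_after == [7.3, "b5cc17d5a24e9b04"]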
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,18 +1,19 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import base64
-from collections import defaultdict
import itertools
import re
-from typing import Any, Dict, Iterable, Iterator, List, Optional
-import msgpack
+from collections import defaultdict
+from typing import Any, Dict, Iterable, Iterator, List, Optional
from swh.core.api import remote_api_endpoint
from swh.model.identifiers import origin_identifier
+from swh.model.model import Origin
+
+from swh.search.interface import PagedResult
def _sanitize_origin(origin):
@@ -61,15 +62,15 @@
def origin_search(
self,
*,
- url_pattern: str = None,
- metadata_pattern: str = None,
+ url_pattern: Optional[str] = None,
+ metadata_pattern: Optional[str] = None,
with_visit: bool = False,
- page_token: str = None,
- count: int = 50,
- ) -> Dict[str, object]:
- matches = (
+ page_token: Optional[str] = None,
+ limit: int = 50,
+ ) -> PagedResult[Origin]:
+ hits: Iterator[Dict[str, Any]] = (
self._origins[id_] for id_ in self._origin_ids
- ) # type: Iterator[Dict[str, Any]]
+ )
if url_pattern:
tokens = set(self._url_splitter.split(url_pattern))
@@ -88,7 +89,7 @@
for token in match["_url_tokens"]
)
- matches = filter(predicate, matches)
+ hits = filter(predicate, hits)
if metadata_pattern:
raise NotImplementedError(
@@ -100,28 +101,24 @@
"At least one of url_pattern and metadata_pattern must be provided."
)
+ next_page_token = None
+
if with_visit:
- matches = filter(lambda o: o.get("has_visits"), matches)
-
- if page_token:
- page_token_content = msgpack.loads(base64.b64decode(page_token))
- start_at_index = page_token_content[b"start_at_index"]
- else:
- start_at_index = 0
-
- hits = list(itertools.islice(matches, start_at_index, start_at_index + count))
-
- if len(hits) == count:
- next_page_token_content = {
- b"start_at_index": start_at_index + count,
- }
- next_page_token = base64.b64encode(
- msgpack.dumps(next_page_token_content)
- ) # type: Optional[bytes]
- else:
- next_page_token = None
-
- return {
- "next_page_token": next_page_token,
- "results": [{"url": hit["url"]} for hit in hits],
- }
+ hits = filter(lambda o: o.get("has_visits"), hits)
+
+ start_at_index = int(page_token) if page_token else 0
+
+ origins: List[Origin] = [
+ Origin(url=hit["url"])
+ for hit in itertools.islice(
+ hits, start_at_index, start_at_index + limit + 1
+ )
+ ]
+
+ if len(origins) > limit:
+ next_page_token = str(start_at_index + limit)
+ origins = origins[:limit]
+
+ assert len(origins) <= limit
+
+ return PagedResult(results=origins, next_page_token=next_page_token,)
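The in-memory backend keeps a plain integer offset as its token, but now slices limit + 1 candidates so that the presence of an extra element, rather than an exact-length page, signals that more data exists. The same "fetch one extra" pattern in isolation (function and variable names are illustrative, not part of the diff):

    import itertools
    from typing import Iterator, List, Optional, Tuple

    def paginate(
        items: Iterator[str], page_token: Optional[str], limit: int
    ) -> Tuple[List[str], Optional[str]]:
        # The token is just a stringified offset into the result stream.
        start = int(page_token) if page_token else 0
        window = list(itertools.islice(items, start, start + limit + 1))

        next_token = None
        if len(window) > limit:       # the extra element proves a next page exists
            next_token = str(start + limit)
            window = window[:limit]
        return window, next_token

    page, token = paginate(iter(["a", "b", "c"]), None, 2)
    assert (page, token) == (["a", "b"], "2")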
diff --git a/swh/search/interface.py b/swh/search/interface.py
new file mode 100644
--- /dev/null
+++ b/swh/search/interface.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import TypeVar
+
+from swh.core.api.classes import PagedResult as CorePagedResult
+
+
+TResult = TypeVar("TResult")
+PagedResult = CorePagedResult[TResult, str]
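swh/search/interface.py only specializes swh.core's generic PagedResult so that both backends and the RPC client share one return type: a list of results plus an optional string page token. A minimal construction sketch, assuming swh.core[http] >= 0.2.0 as pinned above:

    from swh.model.model import Origin
    from swh.search.interface import PagedResult

    page: PagedResult[Origin] = PagedResult(
        results=[Origin(url="https://example.org/repo.git")],
        next_page_token=None,         # None means this is the last page
    )

    assert page.results[0].url == "https://example.org/repo.git"
    assert page.next_page_token is None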
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -12,9 +12,11 @@
from confluent_kafka import Producer
from click.testing import CliRunner
+from swh.model.model import Origin
from swh.journal.serializers import value_to_kafka
from swh.search.cli import cli
+from swh.search.interface import PagedResult
CLI_CONFIG = """
@@ -82,17 +84,16 @@
swh_search.flush()
# searching origin without visit as requirement
- results = swh_search.origin_search(url_pattern="foobar")
+ actual_page = swh_search.origin_search(url_pattern="foobar")
# We find it
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [Origin(url="http://foobar.baz")]
# It's an origin with no visit, searching for it with visit
- results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
# returns nothing
- assert results == {"next_page_token": None, "results": []}
+ assert actual_page.next_page_token is None
+ assert actual_page.results == []
def test__journal_client__origin_visit(
@@ -128,15 +129,15 @@
swh_search.flush()
- expected_result = {
- "next_page_token": None,
- "results": [{"url": "http://baz.foobar"}],
- }
+ expected_page = PagedResult(
+ next_page_token=None, results=[Origin(url="http://baz.foobar")],
+ )
+
# Both search returns the visit
- results = swh_search.origin_search(url_pattern="foobar", with_visit=False)
- assert results == expected_result
- results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == expected_result
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=False)
+ assert actual_page == expected_page
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page == expected_page
def test__journal_client__missing_main_journal_config_key(elasticsearch_host):
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,125 +7,114 @@
from swh.search.utils import stream_results
+from swh.model.model import Origin
+
class CommonSearchTest:
def test_origin_url_unique_word_prefix(self):
- self.search.origin_update(
- [
- {"url": "http://foobar.baz"},
- {"url": "http://barbaz.qux"},
- {"url": "http://qux.quux"},
- ]
- )
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+ origin_barbaz_qux = Origin(url="http://barbaz.qux")
+ origin_qux_quux = Origin(url="http://qux.quux")
+ origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
+
+ self.search.origin_update([o.to_dict() for o in origins])
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
- results = self.search.origin_search(url_pattern="barb")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
+ actual_page = self.search.origin_search(url_pattern="barb")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
# 'bar' is part of 'foobar', but is not the beginning of it
- results = self.search.origin_search(url_pattern="bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
-
- results = self.search.origin_search(url_pattern="barbaz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
+ actual_page = self.search.origin_search(url_pattern="bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
- def test_origin_url_unique_word_prefix_multiple_results(self):
- self.search.origin_update(
- [
- {"url": "http://foobar.baz"},
- {"url": "http://barbaz.qux"},
- {"url": "http://qux.quux"},
- ]
- )
- self.search.flush()
+ actual_page = self.search.origin_search(url_pattern="barbaz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
- results = self.search.origin_search(url_pattern="qu")
- assert results["next_page_token"] is None
+ def test_origin_url_unique_word_prefix_multiple_results(self):
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+ origin_barbaz_qux = Origin(url="http://barbaz.qux")
+ origin_qux_quux = Origin(url="http://qux.quux")
+ origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
- results = [res["url"] for res in results["results"]]
- expected_results = ["http://qux.quux", "http://barbaz.qux"]
- assert sorted(results) == sorted(expected_results)
+ self.search.origin_update([o.to_dict() for o in origins])
+ self.search.flush()
- results = self.search.origin_search(url_pattern="qux")
- assert results["next_page_token"] is None
+ actual_page = self.search.origin_search(url_pattern="qu")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux])
- results = [res["url"] for res in results["results"]]
- expected_results = ["http://barbaz.qux", "http://qux.quux"]
- assert sorted(results) == sorted(expected_results)
+ actual_page = self.search.origin_search(url_pattern="qux")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux])
def test_origin_url_all_terms(self):
- self.search.origin_update(
- [{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},]
- )
+ origin_foo_bar_baz = Origin(url="http://foo.bar/baz")
+ origin_foo_bar_foo_bar = Origin(url="http://foo.bar/foo.bar")
+ origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar]
+
+ self.search.origin_update([o.to_dict() for o in origins])
self.search.flush()
# Only results containing all terms should be returned.
- results = self.search.origin_search(url_pattern="foo bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foo.bar/baz"},],
- }
+ actual_page = self.search.origin_search(url_pattern="foo bar baz")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_foo_bar_baz])
def test_origin_with_visit(self):
+ origin_foobar_baz = Origin(url="http://foobar/baz")
+
self.search.origin_update(
- [{"url": "http://foobar.baz", "has_visits": True},]
+ [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]]
)
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
def test_origin_with_visit_added(self):
- self.search.origin_update(
- [{"url": "http://foobar.baz"},]
- )
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+
+ self.search.origin_update([o.to_dict() for o in [origin_foobar_baz]])
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {"next_page_token": None, "results": []}
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == []
self.search.origin_update(
- [{"url": "http://foobar.baz", "has_visits": True},]
+ [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]]
)
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
def test_origin_intrinsic_metadata_description(self):
+ origin1_nothin = Origin(url="http://origin1")
+ origin2_foobar = Origin(url="http://origin2")
+ origin3_barbaz = Origin(url="http://origin3")
+
self.search.origin_update(
[
- {"url": "http://origin1", "intrinsic_metadata": {},},
+ {"url": origin1_nothin.url, "intrinsic_metadata": {},},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
},
},
{
- "url": "http://origin3",
+ "url": origin3_barbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "bar baz",
@@ -135,36 +124,34 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="foo bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="bar baz")
+
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_barbaz]
def test_origin_intrinsic_metadata_all_terms(self):
+ origin1_foobarfoobar = Origin(url="http://origin1")
+ origin3_foobarbaz = Origin(url="http://origin2")
+
self.search.origin_update(
[
{
- "url": "http://origin1",
+ "url": origin1_foobarfoobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar foo bar",
},
},
{
- "url": "http://origin3",
+ "url": origin3_foobarbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar baz",
@@ -174,25 +161,27 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_foobarbaz]
def test_origin_intrinsic_metadata_nested(self):
+ origin1_nothin = Origin(url="http://origin1")
+ origin2_foobar = Origin(url="http://origin2")
+ origin3_barbaz = Origin(url="http://origin3")
+
self.search.origin_update(
[
- {"url": "http://origin1", "intrinsic_metadata": {},},
+ {"url": origin1_nothin.url, "intrinsic_metadata": {},},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
- "url": "http://origin3",
+ "url": origin3_barbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["bar", "baz"],
@@ -202,23 +191,17 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="foo bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="bar baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_barbaz]
# TODO: add more tests with more codemeta terms
@@ -226,71 +209,60 @@
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
- def test_origin_url_paging(self, count):
+ def test_origin_url_paging(self, limit):
# TODO: no hypothesis
+ origin1_foo = Origin(url="http://origin1/foo")
+ origin2_foobar = Origin(url="http://origin2/foo/bar")
+ origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
self.reset()
self.search.origin_update(
- [
- {"url": "http://origin1/foo"},
- {"url": "http://origin2/foo/bar"},
- {"url": "http://origin3/foo/bar/baz"},
- ]
+ [o.to_dict() for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]]
)
self.search.flush()
results = stream_results(
- self.search.origin_search, url_pattern="foo bar baz", count=count
+ self.search.origin_search, url_pattern="foo bar baz", limit=limit
)
- results = [res["url"] for res in results]
- expected_results = [
- "http://origin3/foo/bar/baz",
- ]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, url_pattern="foo bar", count=count
+ self.search.origin_search, url_pattern="foo bar", limit=limit
)
- expected_results = [
- "http://origin2/foo/bar",
- "http://origin3/foo/bar/baz",
- ]
- results = [res["url"] for res in results]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin2_foobar, origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, url_pattern="foo", count=count
+ self.search.origin_search, url_pattern="foo", limit=limit
)
- expected_results = [
- "http://origin1/foo",
- "http://origin2/foo/bar",
- "http://origin3/foo/bar/baz",
- ]
- results = [res["url"] for res in results]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz])
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
- def test_origin_intrinsic_metadata_paging(self, count):
+ def test_origin_intrinsic_metadata_paging(self, limit):
# TODO: no hypothesis
+ origin1_foo = Origin(url="http://origin1/foo")
+ origin2_foobar = Origin(url="http://origin2/foo/bar")
+ origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
self.reset()
self.search.origin_update(
[
{
- "url": "http://origin1",
+ "url": origin1_foo.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo"],
},
},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
- "url": "http://origin3",
+ "url": origin3_foobarbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar", "baz"],
@@ -301,20 +273,16 @@
self.search.flush()
results = stream_results(
- self.search.origin_search, metadata_pattern="foo bar baz", count=count
+ self.search.origin_search, metadata_pattern="foo bar baz", limit=limit
)
- assert list(results) == [{"url": "http://origin3"}]
+ assert set(results) == set([origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, metadata_pattern="foo bar", count=count
+ self.search.origin_search, metadata_pattern="foo bar", limit=limit
)
- assert list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}]
+ assert set(results) == set([origin2_foobar, origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, metadata_pattern="foo", count=count
+ self.search.origin_search, metadata_pattern="foo", limit=limit
)
- assert list(results) == [
- {"url": "http://origin1"},
- {"url": "http://origin2"},
- {"url": "http://origin3"},
- ]
+ assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz])
diff --git a/swh/search/tests/test_serializers.py b/swh/search/tests/test_serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/search/tests/test_serializers.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from swh.model.model import Origin
+
+from swh.search.api.serializers import _encode_model_object, _decode_model_object
+
+
+def test_serialization_back_and_forth():
+    """Testing serialization on origin back and forth should be fine
+
+    """
+    origin = Origin(url="foobar")
+
+    assert _decode_model_object(_encode_model_object(origin)) == origin
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -1,16 +1,19 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def stream_results(f, *args, **kwargs):
+ """Consume the paginated result and stream it directly
+
+ """
if "page_token" in kwargs:
raise TypeError('stream_results has no argument "page_token".')
page_token = None
while True:
results = f(*args, page_token=page_token, **kwargs)
- yield from results["results"]
- page_token = results["next_page_token"]
+ yield from results.results
+ page_token = results.next_page_token
if page_token is None:
break
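stream_results is the one generic consumer that had to change shape: it now reads PagedResult attributes instead of dict keys and keeps requesting pages until next_page_token comes back None. A self-contained sketch against a toy paginated function (fake_search and DATA are illustrative, not part of swh.search):

    from swh.search.interface import PagedResult
    from swh.search.utils import stream_results

    DATA = ["a", "b", "c"]

    def fake_search(*, page_token=None, limit=2):
        # Serve DATA a page at a time, using a stringified offset as the token.
        start = int(page_token) if page_token else 0
        chunk = DATA[start : start + limit]
        token = str(start + limit) if start + limit < len(DATA) else None
        return PagedResult(results=chunk, next_page_token=token)

    assert list(stream_results(fake_search, limit=2)) == ["a", "b", "c"]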