D3657.id12881.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
# Add here internal Software Heritage dependencies, one per line.
-swh.core[http]
+swh.core[http] >= 0.2.0
swh.journal >= 0.1.0
swh.model
diff --git a/swh/search/api/client.py b/swh/search/api/client.py
--- a/swh/search/api/client.py
+++ b/swh/search/api/client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,9 +6,13 @@
from swh.core.api import RPCClient
from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
class RemoteSearch(RPCClient):
"""Proxy to a remote search API"""
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
backend_class = ElasticSearch
diff --git a/swh/search/api/serializers.py b/swh/search/api/serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/search/api/serializers.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Extra decoder(s)/encoder(s) for swh-model objects."""
+
+from typing import Callable, Dict, List, Tuple
+
+import swh.model.model as model
+
+
+def _encode_model_object(obj):
+ d = obj.to_dict()
+ d["__type__"] = type(obj).__name__
+ return d
+
+
+def _decode_model_object(d):
+ return getattr(model, d.pop("__type__")).from_dict(d)
+
+
+ENCODERS: List[Tuple[type, str, Callable]] = [
+ (model.BaseModel, "model", _encode_model_object),
+]
+
+
+DECODERS: Dict[str, Callable] = {
+ "model": _decode_model_object,
+}
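
A minimal usage sketch of the round trip these helpers implement (the Origin URL is illustrative):

from swh.model.model import Origin
from swh.search.api.serializers import _decode_model_object, _encode_model_object

origin = Origin(url="https://example.org/repo.git")  # illustrative URL
encoded = _encode_model_object(origin)
# encoded is origin.to_dict() plus a "__type__" discriminator, here "Origin"
assert encoded["__type__"] == "Origin"
assert _decode_model_object(encoded) == origin
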
diff --git a/swh/search/api/server.py b/swh/search/api/server.py
--- a/swh/search/api/server.py
+++ b/swh/search/api/server.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -11,6 +11,7 @@
from .. import get_search
from ..elasticsearch import ElasticSearch
+from .serializers import DECODERS, ENCODERS
def _get_search():
@@ -21,8 +22,14 @@
return search
-app = RPCServerApp(__name__, backend_class=ElasticSearch, backend_factory=_get_search)
+class SearchServerApp(RPCServerApp):
+ extra_type_decoders = DECODERS
+ extra_type_encoders = ENCODERS
+
+app = SearchServerApp(
+ __name__, backend_class=ElasticSearch, backend_factory=_get_search
+)
search = None
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,19 +1,21 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import base64
+import msgpack
+
from typing import Any, Iterable, Dict, List, Iterator, Optional
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan
-import msgpack
from swh.core.api import remote_api_endpoint
from swh.model import model
from swh.model.identifiers import origin_identifier
+from swh.search.interface import PagedResult
+
def _sanitize_origin(origin):
origin = origin.copy()
@@ -106,31 +108,27 @@
def origin_search(
self,
*,
- url_pattern: str = None,
+ url_pattern: Optional[str] = None,
metadata_pattern: str = None,
with_visit: bool = False,
- page_token: str = None,
- count: int = 50,
- ) -> Dict[str, object]:
+ page_token: Optional[str] = None,
+ limit: int = 50,
+ ) -> PagedResult[model.Origin]:
"""Searches for origins matching the `url_pattern`.
Args:
- url_pattern (str): Part of thr URL to search for
- with_visit (bool): Whether origins with no visit are to be
- filtered out
- page_token (str): Opaque value used for pagination.
- count (int): number of results to return.
+ url_pattern: Part of the URL to search for
+ with_visit: Whether origins with no visit are to be
+ filtered out
+ page_token: Opaque value used for pagination
+ limit: number of results to return
Returns:
- a dictionary with keys:
- * `next_page_token`:
- opaque value used for fetching more results. `None` if there
- are no more result.
- * `results`:
- list of dictionaries with key:
- * `url`: URL of a matching origin
+ PagedResult of Origin objects matching the search criteria. If next_page_token is
+ None, there is no more data to retrieve.
+
"""
- query_clauses = [] # type: List[Dict[str, Any]]
+ query_clauses: List[Dict[str, Any]] = []
if url_pattern:
query_clauses.append(
@@ -169,45 +167,38 @@
"At least one of url_pattern and metadata_pattern must be provided."
)
+ next_page_token = None
+
if with_visit:
query_clauses.append({"term": {"has_visits": True,}})
body = {
"query": {"bool": {"must": query_clauses,}},
- "size": count,
"sort": [{"_score": "desc"}, {"sha1": "asc"},],
}
if page_token:
# TODO: use ElasticSearch's scroll API?
- page_token_content = msgpack.loads(base64.b64decode(page_token), raw=True)
+ page_token_content = msgpack.loads(page_token, raw=True)
body["search_after"] = [
page_token_content[b"score"],
page_token_content[b"sha1"].decode("ascii"),
]
- res = self._backend.search(index="origin", body=body, size=count,)
+ res = self._backend.search(index="origin", body=body, size=limit)
hits = res["hits"]["hits"]
- if len(hits) == count:
+ if len(hits) == limit:
last_hit = hits[-1]
next_page_token_content = {
b"score": last_hit["_score"],
b"sha1": last_hit["_source"]["sha1"],
}
- next_page_token = base64.b64encode(
- msgpack.dumps(next_page_token_content)
- ) # type: Optional[bytes]
- else:
- next_page_token = None
-
- return {
- "next_page_token": next_page_token,
- "results": [
- {
- # TODO: also add 'id'?
- "url": hit["_source"]["url"],
- }
- for hit in hits
- ],
- }
+ next_page_token = msgpack.dumps(next_page_token_content)
+
+ assert len(hits) <= limit
+
+ return PagedResult(
+ results=[model.Origin(url=hit["_source"]["url"]) for hit in hits],
+ next_page_token=next_page_token,
+ )
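
A hedged sketch of paging through results with the new PagedResult-based API, assuming `search` is an already-configured backend and "gitlab" is an arbitrary pattern:

page_token = None
while True:
    # request one page at a time until the backend reports no more results
    page = search.origin_search(url_pattern="gitlab", page_token=page_token, limit=100)
    for origin in page.results:
        print(origin.url)
    page_token = page.next_page_token
    if page_token is None:
        break
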
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,18 +1,19 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import base64
-from collections import defaultdict
import itertools
import re
-from typing import Any, Dict, Iterable, Iterator, List, Optional
-import msgpack
+from collections import defaultdict
+from typing import Any, Dict, Iterable, Iterator, List, Optional
from swh.core.api import remote_api_endpoint
from swh.model.identifiers import origin_identifier
+from swh.model.model import Origin
+
+from swh.search.interface import PagedResult
def _sanitize_origin(origin):
@@ -61,15 +62,15 @@
def origin_search(
self,
*,
- url_pattern: str = None,
- metadata_pattern: str = None,
+ url_pattern: Optional[str] = None,
+ metadata_pattern: Optional[str] = None,
with_visit: bool = False,
- page_token: str = None,
- count: int = 50,
- ) -> Dict[str, object]:
- matches = (
+ page_token: Optional[str] = None,
+ limit: int = 50,
+ ) -> PagedResult[Origin]:
+ hits: Iterator[Dict[str, Any]] = (
self._origins[id_] for id_ in self._origin_ids
- ) # type: Iterator[Dict[str, Any]]
+ )
if url_pattern:
tokens = set(self._url_splitter.split(url_pattern))
@@ -88,7 +89,7 @@
for token in match["_url_tokens"]
)
- matches = filter(predicate, matches)
+ hits = filter(predicate, hits)
if metadata_pattern:
raise NotImplementedError(
@@ -100,28 +101,24 @@
"At least one of url_pattern and metadata_pattern must be provided."
)
+ next_page_token = None
+
if with_visit:
- matches = filter(lambda o: o.get("has_visits"), matches)
-
- if page_token:
- page_token_content = msgpack.loads(base64.b64decode(page_token))
- start_at_index = page_token_content[b"start_at_index"]
- else:
- start_at_index = 0
-
- hits = list(itertools.islice(matches, start_at_index, start_at_index + count))
-
- if len(hits) == count:
- next_page_token_content = {
- b"start_at_index": start_at_index + count,
- }
- next_page_token = base64.b64encode(
- msgpack.dumps(next_page_token_content)
- ) # type: Optional[bytes]
- else:
- next_page_token = None
-
- return {
- "next_page_token": next_page_token,
- "results": [{"url": hit["url"]} for hit in hits],
- }
+ hits = filter(lambda o: o.get("has_visits"), hits)
+
+ start_at_index = int(page_token) if page_token else 0
+
+ origins: List[Origin] = [
+ Origin(url=hit["url"])
+ for hit in itertools.islice(
+ hits, start_at_index, start_at_index + limit + 1
+ )
+ ]
+
+ if len(origins) > limit:
+ next_page_token = str(start_at_index + limit)
+ origins = origins[:limit]
+
+ assert len(origins) <= limit
+
+ return PagedResult(results=origins, next_page_token=next_page_token,)
diff --git a/swh/search/interface.py b/swh/search/interface.py
new file mode 100644
--- /dev/null
+++ b/swh/search/interface.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import TypeVar
+
+from swh.core.api.classes import PagedResult as CorePagedResult
+
+
+TResult = TypeVar("TResult")
+PagedResult = CorePagedResult[TResult, str]
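
A short sketch of what this alias provides: PagedResult[Origin] bundles a list of results with an optional string next_page_token (the URL below is illustrative):

from swh.model.model import Origin
from swh.search.interface import PagedResult

# a final page: one result, no further pages to fetch
page: PagedResult[Origin] = PagedResult(
    results=[Origin(url="https://example.org/repo.git")],
    next_page_token=None,
)
assert page.next_page_token is None
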
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -12,9 +12,11 @@
from confluent_kafka import Producer
from click.testing import CliRunner
+from swh.model.model import Origin
from swh.journal.serializers import value_to_kafka
from swh.search.cli import cli
+from swh.search.interface import PagedResult
CLI_CONFIG = """
@@ -82,17 +84,16 @@
swh_search.flush()
# searching origin without visit as requirement
- results = swh_search.origin_search(url_pattern="foobar")
+ actual_page = swh_search.origin_search(url_pattern="foobar")
# We find it
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [Origin(url="http://foobar.baz")]
# It's an origin with no visit, searching for it with visit
- results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
# returns nothing
- assert results == {"next_page_token": None, "results": []}
+ assert actual_page.next_page_token is None
+ assert actual_page.results == []
def test__journal_client__origin_visit(
@@ -128,15 +129,15 @@
swh_search.flush()
- expected_result = {
- "next_page_token": None,
- "results": [{"url": "http://baz.foobar"}],
- }
+ expected_page = PagedResult(
+ next_page_token=None, results=[Origin(url="http://baz.foobar")],
+ )
+
# Both search returns the visit
- results = swh_search.origin_search(url_pattern="foobar", with_visit=False)
- assert results == expected_result
- results = swh_search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == expected_result
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=False)
+ assert actual_page == expected_page
+ actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page == expected_page
def test__journal_client__missing_main_journal_config_key(elasticsearch_host):
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,125 +7,114 @@
from swh.search.utils import stream_results
+from swh.model.model import Origin
+
class CommonSearchTest:
def test_origin_url_unique_word_prefix(self):
- self.search.origin_update(
- [
- {"url": "http://foobar.baz"},
- {"url": "http://barbaz.qux"},
- {"url": "http://qux.quux"},
- ]
- )
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+ origin_barbaz_qux = Origin(url="http://barbaz.qux")
+ origin_qux_quux = Origin(url="http://qux.quux")
+ origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
+
+ self.search.origin_update([o.to_dict() for o in origins])
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
- results = self.search.origin_search(url_pattern="barb")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
+ actual_page = self.search.origin_search(url_pattern="barb")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
# 'bar' is part of 'foobar', but is not the beginning of it
- results = self.search.origin_search(url_pattern="bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
-
- results = self.search.origin_search(url_pattern="barbaz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://barbaz.qux"}],
- }
+ actual_page = self.search.origin_search(url_pattern="bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
- def test_origin_url_unique_word_prefix_multiple_results(self):
- self.search.origin_update(
- [
- {"url": "http://foobar.baz"},
- {"url": "http://barbaz.qux"},
- {"url": "http://qux.quux"},
- ]
- )
- self.search.flush()
+ actual_page = self.search.origin_search(url_pattern="barbaz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_barbaz_qux]
- results = self.search.origin_search(url_pattern="qu")
- assert results["next_page_token"] is None
+ def test_origin_url_unique_word_prefix_multiple_results(self):
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+ origin_barbaz_qux = Origin(url="http://barbaz.qux")
+ origin_qux_quux = Origin(url="http://qux.quux")
+ origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux]
- results = [res["url"] for res in results["results"]]
- expected_results = ["http://qux.quux", "http://barbaz.qux"]
- assert sorted(results) == sorted(expected_results)
+ self.search.origin_update([o.to_dict() for o in origins])
+ self.search.flush()
- results = self.search.origin_search(url_pattern="qux")
- assert results["next_page_token"] is None
+ actual_page = self.search.origin_search(url_pattern="qu")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux])
- results = [res["url"] for res in results["results"]]
- expected_results = ["http://barbaz.qux", "http://qux.quux"]
- assert sorted(results) == sorted(expected_results)
+ actual_page = self.search.origin_search(url_pattern="qux")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux])
def test_origin_url_all_terms(self):
- self.search.origin_update(
- [{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},]
- )
+ origin_foo_bar_baz = Origin(url="http://foo.bar/baz")
+ origin_foo_bar_foo_bar = Origin(url="http://foo.bar/foo.bar")
+ origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar]
+
+ self.search.origin_update([o.to_dict() for o in origins])
self.search.flush()
# Only results containing all terms should be returned.
- results = self.search.origin_search(url_pattern="foo bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foo.bar/baz"},],
- }
+ actual_page = self.search.origin_search(url_pattern="foo bar baz")
+ assert actual_page.next_page_token is None
+ assert set(actual_page.results) == set([origin_foo_bar_baz])
def test_origin_with_visit(self):
+ origin_foobar_baz = Origin(url="http://foobar/baz")
+
self.search.origin_update(
- [{"url": "http://foobar.baz", "has_visits": True},]
+ [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]]
)
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
def test_origin_with_visit_added(self):
- self.search.origin_update(
- [{"url": "http://foobar.baz"},]
- )
+ origin_foobar_baz = Origin(url="http://foobar.baz")
+
+ self.search.origin_update([o.to_dict() for o in [origin_foobar_baz]])
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {"next_page_token": None, "results": []}
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == []
self.search.origin_update(
- [{"url": "http://foobar.baz", "has_visits": True},]
+ [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]]
)
self.search.flush()
- results = self.search.origin_search(url_pattern="foobar", with_visit=True)
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://foobar.baz"}],
- }
+ actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True)
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin_foobar_baz]
def test_origin_intrinsic_metadata_description(self):
+ origin1_nothin = Origin(url="http://origin1")
+ origin2_foobar = Origin(url="http://origin2")
+ origin3_barbaz = Origin(url="http://origin3")
+
self.search.origin_update(
[
- {"url": "http://origin1", "intrinsic_metadata": {},},
+ {"url": origin1_nothin.url, "intrinsic_metadata": {},},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar",
},
},
{
- "url": "http://origin3",
+ "url": origin3_barbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "bar baz",
@@ -135,36 +124,34 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="foo bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="bar baz")
+
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_barbaz]
def test_origin_intrinsic_metadata_all_terms(self):
+ origin1_foobarfoobar = Origin(url="http://origin1")
+ origin3_foobarbaz = Origin(url="http://origin2")
+
self.search.origin_update(
[
{
- "url": "http://origin1",
+ "url": origin1_foobarfoobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar foo bar",
},
},
{
- "url": "http://origin3",
+ "url": origin3_foobarbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"description": "foo bar baz",
@@ -174,25 +161,27 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_foobarbaz]
def test_origin_intrinsic_metadata_nested(self):
+ origin1_nothin = Origin(url="http://origin1")
+ origin2_foobar = Origin(url="http://origin2")
+ origin3_barbaz = Origin(url="http://origin3")
+
self.search.origin_update(
[
- {"url": "http://origin1", "intrinsic_metadata": {},},
+ {"url": origin1_nothin.url, "intrinsic_metadata": {},},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
- "url": "http://origin3",
+ "url": origin3_barbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["bar", "baz"],
@@ -202,23 +191,17 @@
)
self.search.flush()
- results = self.search.origin_search(metadata_pattern="foo")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="foo bar")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin2"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="foo bar")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin2_foobar]
- results = self.search.origin_search(metadata_pattern="bar baz")
- assert results == {
- "next_page_token": None,
- "results": [{"url": "http://origin3"}],
- }
+ actual_page = self.search.origin_search(metadata_pattern="bar baz")
+ assert actual_page.next_page_token is None
+ assert actual_page.results == [origin3_barbaz]
# TODO: add more tests with more codemeta terms
@@ -226,71 +209,60 @@
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
- def test_origin_url_paging(self, count):
+ def test_origin_url_paging(self, limit):
# TODO: no hypothesis
+ origin1_foo = Origin(url="http://origin1/foo")
+ origin2_foobar = Origin(url="http://origin2/foo/bar")
+ origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
self.reset()
self.search.origin_update(
- [
- {"url": "http://origin1/foo"},
- {"url": "http://origin2/foo/bar"},
- {"url": "http://origin3/foo/bar/baz"},
- ]
+ [o.to_dict() for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]]
)
self.search.flush()
results = stream_results(
- self.search.origin_search, url_pattern="foo bar baz", count=count
+ self.search.origin_search, url_pattern="foo bar baz", limit=limit
)
- results = [res["url"] for res in results]
- expected_results = [
- "http://origin3/foo/bar/baz",
- ]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, url_pattern="foo bar", count=count
+ self.search.origin_search, url_pattern="foo bar", limit=limit
)
- expected_results = [
- "http://origin2/foo/bar",
- "http://origin3/foo/bar/baz",
- ]
- results = [res["url"] for res in results]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin2_foobar, origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, url_pattern="foo", count=count
+ self.search.origin_search, url_pattern="foo", limit=limit
)
- expected_results = [
- "http://origin1/foo",
- "http://origin2/foo/bar",
- "http://origin3/foo/bar/baz",
- ]
- results = [res["url"] for res in results]
- assert sorted(results[0 : len(expected_results)]) == sorted(expected_results)
+ assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz])
@settings(deadline=None)
@given(strategies.integers(min_value=1, max_value=4))
- def test_origin_intrinsic_metadata_paging(self, count):
+ def test_origin_intrinsic_metadata_paging(self, limit):
# TODO: no hypothesis
+ origin1_foo = Origin(url="http://origin1/foo")
+ origin2_foobar = Origin(url="http://origin2/foo/bar")
+ origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz")
+
self.reset()
self.search.origin_update(
[
{
- "url": "http://origin1",
+ "url": origin1_foo.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo"],
},
},
{
- "url": "http://origin2",
+ "url": origin2_foobar.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar"],
},
},
{
- "url": "http://origin3",
+ "url": origin3_foobarbaz.url,
"intrinsic_metadata": {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"keywords": ["foo", "bar", "baz"],
@@ -301,20 +273,16 @@
self.search.flush()
results = stream_results(
- self.search.origin_search, metadata_pattern="foo bar baz", count=count
+ self.search.origin_search, metadata_pattern="foo bar baz", limit=limit
)
- assert list(results) == [{"url": "http://origin3"}]
+ assert set(results) == set([origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, metadata_pattern="foo bar", count=count
+ self.search.origin_search, metadata_pattern="foo bar", limit=limit
)
- assert list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}]
+ assert set(results) == set([origin2_foobar, origin3_foobarbaz])
results = stream_results(
- self.search.origin_search, metadata_pattern="foo", count=count
+ self.search.origin_search, metadata_pattern="foo", limit=limit
)
- assert list(results) == [
- {"url": "http://origin1"},
- {"url": "http://origin2"},
- {"url": "http://origin3"},
- ]
+ assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz])
diff --git a/swh/search/tests/test_serializers.py b/swh/search/tests/test_serializers.py
new file mode 100644
--- /dev/null
+++ b/swh/search/tests/test_serializers.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from swh.model.model import Origin
+
+from swh.search.api.serializers import _encode_model_object, _decode_model_object
+
+
+def test_serialization_back_and_forth():
+ """Serializing then deserializing an origin should round-trip to an equal object
+
+ """
+ origin = Origin(url="foobar")
+
+ assert _decode_model_object(_encode_model_object(origin)) == origin
diff --git a/swh/search/utils.py b/swh/search/utils.py
--- a/swh/search/utils.py
+++ b/swh/search/utils.py
@@ -1,16 +1,19 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def stream_results(f, *args, **kwargs):
+ """Consume the paginated result and stream it directly
+
+ """
if "page_token" in kwargs:
raise TypeError('stream_results has no argument "page_token".')
page_token = None
while True:
results = f(*args, page_token=page_token, **kwargs)
- yield from results["results"]
- page_token = results["next_page_token"]
+ yield from results.results
+ page_token = results.next_page_token
if page_token is None:
break
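
With this change stream_results yields model objects directly; a minimal usage sketch, assuming `search` is a configured search backend instance:

from swh.search.utils import stream_results

# iterate over every matching origin, fetching further pages as needed
for origin in stream_results(search.origin_search, url_pattern="foo bar", limit=10):
    print(origin.url)
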
