Changeset View
Standalone View
swh/search/tests/test_search.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from hypothesis import given, strategies, settings | from hypothesis import given, strategies, settings | ||||
from swh.search.utils import stream_results | from swh.search.utils import stream_results | ||||
from swh.model.model import Origin | |||||
class CommonSearchTest: | class CommonSearchTest: | ||||
def test_origin_url_unique_word_prefix(self): | def test_origin_url_unique_word_prefix(self): | ||||
self.search.origin_update( | origin_foobar_baz = Origin(url="http://foobar.baz") | ||||
[ | origin_barbaz_qux = Origin(url="http://barbaz.qux") | ||||
{"url": "http://foobar.baz"}, | origin_qux_quux = Origin(url="http://qux.quux") | ||||
{"url": "http://barbaz.qux"}, | origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] | ||||
{"url": "http://qux.quux"}, | |||||
] | self.search.origin_update([o.to_dict() for o in origins]) | ||||
) | |||||
self.search.flush() | self.search.flush() | ||||
vlorentz: I think you should include the TLD in the variable name (eg. `origin_foobar_baz`), so it's… | |||||
results = self.search.origin_search(url_pattern="foobar") | actual_page = self.search.origin_search(url_pattern="foobar") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | actual_page = self.search.origin_search(url_pattern="barb") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(url_pattern="barb") | assert actual_page.results == [origin_barbaz_qux] | ||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | |||||
# 'bar' is part of 'foobar', but is not the beginning of it | # 'bar' is part of 'foobar', but is not the beginning of it | ||||
results = self.search.origin_search(url_pattern="bar") | actual_page = self.search.origin_search(url_pattern="bar") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_barbaz_qux] | ||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | actual_page = self.search.origin_search(url_pattern="barbaz") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(url_pattern="barbaz") | assert actual_page.results == [origin_barbaz_qux] | ||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | |||||
def test_origin_url_unique_word_prefix_multiple_results(self): | def test_origin_url_unique_word_prefix_multiple_results(self): | ||||
self.search.origin_update( | origin_foobar_baz = Origin(url="http://foobar.baz") | ||||
[ | origin_barbaz_qux = Origin(url="http://barbaz.qux") | ||||
{"url": "http://foobar.baz"}, | origin_qux_quux = Origin(url="http://qux.quux") | ||||
{"url": "http://barbaz.qux"}, | origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] | ||||
{"url": "http://qux.quux"}, | |||||
] | self.search.origin_update([o.to_dict() for o in origins]) | ||||
) | |||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="qu") | actual_page = self.search.origin_search(url_pattern="qu") | ||||
assert results["next_page_token"] is None | assert actual_page.next_page_token is None | ||||
assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux]) | |||||
results = [res["url"] for res in results["results"]] | actual_page = self.search.origin_search(url_pattern="qux") | ||||
expected_results = ["http://qux.quux", "http://barbaz.qux"] | assert actual_page.next_page_token is None | ||||
assert sorted(results) == sorted(expected_results) | assert set(actual_page.results) == set([origin_qux_quux, origin_barbaz_qux]) | ||||
results = self.search.origin_search(url_pattern="qux") | |||||
assert results["next_page_token"] is None | |||||
results = [res["url"] for res in results["results"]] | |||||
expected_results = ["http://barbaz.qux", "http://qux.quux"] | |||||
assert sorted(results) == sorted(expected_results) | |||||
def test_origin_url_all_terms(self): | def test_origin_url_all_terms(self): | ||||
self.search.origin_update( | origin_foo_bar_baz = Origin(url="http://foo.bar/baz") | ||||
[{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},] | origin_foo_bar_foo_bar = Origin(url="http://foo.bar/foo.bar") | ||||
) | origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar] | ||||
self.search.origin_update([o.to_dict() for o in origins]) | |||||
self.search.flush() | self.search.flush() | ||||
# Only results containing all terms should be returned. | # Only results containing all terms should be returned. | ||||
results = self.search.origin_search(url_pattern="foo bar baz") | actual_page = self.search.origin_search(url_pattern="foo bar baz") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert set(actual_page.results) == set([origin_foo_bar_baz]) | ||||
"results": [{"url": "http://foo.bar/baz"},], | |||||
} | |||||
def test_origin_with_visit(self): | def test_origin_with_visit(self): | ||||
origin_foobar_baz = Origin(url="http://foobar/baz") | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[{"url": "http://foobar.baz", "has_visits": True},] | [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | |||||
def test_origin_with_visit_added(self): | def test_origin_with_visit_added(self): | ||||
self.search.origin_update( | origin_foobar_baz = Origin(url="http://foobar.baz") | ||||
[{"url": "http://foobar.baz"},] | |||||
) | self.search.origin_update([o.to_dict() for o in [origin_foobar_baz]]) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == {"next_page_token": None, "results": []} | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [] | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[{"url": "http://foobar.baz", "has_visits": True},] | [{**o.to_dict(), "has_visits": True} for o in [origin_foobar_baz]] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_description(self): | def test_origin_intrinsic_metadata_description(self): | ||||
origin1_nothin = Origin(url="http://origin1") | |||||
origin2_foobar = Origin(url="http://origin2") | |||||
origin3_barbaz = Origin(url="http://origin3") | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{"url": "http://origin1", "intrinsic_metadata": {},}, | {"url": origin1_nothin.url, "intrinsic_metadata": {},}, | ||||
{ | { | ||||
"url": "http://origin2", | "url": origin2_foobar.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar", | "description": "foo bar", | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | "url": origin3_barbaz.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "bar baz", | "description": "bar baz", | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo") | actual_page = self.search.origin_search(metadata_pattern="foo") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin2_foobar] | ||||
"results": [{"url": "http://origin2"}], | |||||
} | actual_page = self.search.origin_search(metadata_pattern="foo bar") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(metadata_pattern="foo bar") | assert actual_page.results == [origin2_foobar] | ||||
assert results == { | |||||
"next_page_token": None, | actual_page = self.search.origin_search(metadata_pattern="bar baz") | ||||
"results": [{"url": "http://origin2"}], | |||||
} | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [origin3_barbaz] | |||||
results = self.search.origin_search(metadata_pattern="bar baz") | |||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_all_terms(self): | def test_origin_intrinsic_metadata_all_terms(self): | ||||
origin1_foobarfoobar = Origin(url="http://origin1") | |||||
origin3_foobarbaz = Origin(url="http://origin2") | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{ | { | ||||
"url": "http://origin1", | "url": origin1_foobarfoobar.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar foo bar", | "description": "foo bar foo bar", | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | "url": origin3_foobarbaz.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar baz", | "description": "foo bar baz", | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo bar baz") | actual_page = self.search.origin_search(metadata_pattern="foo bar baz") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin3_foobarbaz] | ||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_nested(self): | def test_origin_intrinsic_metadata_nested(self): | ||||
origin1_nothin = Origin(url="http://origin1") | |||||
origin2_foobar = Origin(url="http://origin2") | |||||
origin3_barbaz = Origin(url="http://origin3") | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{"url": "http://origin1", "intrinsic_metadata": {},}, | {"url": origin1_nothin.url, "intrinsic_metadata": {},}, | ||||
{ | { | ||||
"url": "http://origin2", | "url": origin2_foobar.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar"], | "keywords": ["foo", "bar"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | "url": origin3_barbaz.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["bar", "baz"], | "keywords": ["bar", "baz"], | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo") | actual_page = self.search.origin_search(metadata_pattern="foo") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin2_foobar] | ||||
"results": [{"url": "http://origin2"}], | |||||
} | actual_page = self.search.origin_search(metadata_pattern="foo bar") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(metadata_pattern="foo bar") | assert actual_page.results == [origin2_foobar] | ||||
assert results == { | |||||
"next_page_token": None, | actual_page = self.search.origin_search(metadata_pattern="bar baz") | ||||
"results": [{"url": "http://origin2"}], | assert actual_page.next_page_token is None | ||||
} | assert actual_page.results == [origin3_barbaz] | ||||
results = self.search.origin_search(metadata_pattern="bar baz") | |||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
# TODO: add more tests with more codemeta terms | # TODO: add more tests with more codemeta terms | ||||
# TODO: add more tests with edge cases | # TODO: add more tests with edge cases | ||||
@settings(deadline=None) | @settings(deadline=None) | ||||
@given(strategies.integers(min_value=1, max_value=4)) | @given(strategies.integers(min_value=1, max_value=4)) | ||||
def test_origin_url_paging(self, count): | def test_origin_url_paging(self, limit): | ||||
# TODO: no hypothesis | # TODO: no hypothesis | ||||
origin1_foo = Origin(url="http://origin1/foo") | |||||
origin2_foobar = Origin(url="http://origin2/foo/bar") | |||||
origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz") | |||||
self.reset() | self.reset() | ||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [o.to_dict() for o in [origin1_foo, origin2_foobar, origin3_foobarbaz]] | ||||
{"url": "http://origin1/foo"}, | |||||
{"url": "http://origin2/foo/bar"}, | |||||
{"url": "http://origin3/foo/bar/baz"}, | |||||
] | |||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo bar baz", count=count | self.search.origin_search, url_pattern="foo bar baz", limit=limit | ||||
) | ) | ||||
results = [res["url"] for res in results] | assert set(results) == set([origin3_foobarbaz]) | ||||
expected_results = [ | |||||
"http://origin3/foo/bar/baz", | |||||
] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | |||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo bar", count=count | self.search.origin_search, url_pattern="foo bar", limit=limit | ||||
) | ) | ||||
expected_results = [ | assert set(results) == set([origin2_foobar, origin3_foobarbaz]) | ||||
"http://origin2/foo/bar", | |||||
"http://origin3/foo/bar/baz", | |||||
] | |||||
results = [res["url"] for res in results] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | |||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo", count=count | self.search.origin_search, url_pattern="foo", limit=limit | ||||
) | ) | ||||
expected_results = [ | assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz]) | ||||
"http://origin1/foo", | |||||
"http://origin2/foo/bar", | |||||
"http://origin3/foo/bar/baz", | |||||
] | |||||
results = [res["url"] for res in results] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | |||||
Done Inline Actions
In this test method, it does not look like the order matters. ardumont: > Errrr, actually, no. The order of results matters, and tests must check it.
In this test… | |||||
Done Inline Actions
vlorentz answered elsewhere, here goes.
ardumont: >> What did I miss?
vlorentz answered elsewhere, here goes.
> vlorentz: finding a word in a… | |||||
@settings(deadline=None) | @settings(deadline=None) | ||||
@given(strategies.integers(min_value=1, max_value=4)) | @given(strategies.integers(min_value=1, max_value=4)) | ||||
def test_origin_intrinsic_metadata_paging(self, count): | def test_origin_intrinsic_metadata_paging(self, limit): | ||||
# TODO: no hypothesis | # TODO: no hypothesis | ||||
origin1_foo = Origin(url="http://origin1/foo") | |||||
origin2_foobar = Origin(url="http://origin2/foo/bar") | |||||
origin3_foobarbaz = Origin(url="http://origin3/foo/bar/baz") | |||||
Done Inline Actions: I tried to replace this with pytest.mark.parametrize, but that does not work... [1] I think it does not work directly on methods defined in classes, so that is for later, I guess. ardumont: I tried to replace this with pytest.mark.parametrize, but that does not work... [1]
I think… | |||||
Not Done Inline Actions: finding a word in a short URL is a better match than finding the same word in a long URL. vlorentz: finding a word in a short URL is a better match than finding the same word in a long URL. | |||||
Not Done Inline Actionsbetter match * vlorentz: better match * | |||||
Done Inline Actions: In what context are you telling me this? I don't get it. (My initial parametrize comment was about the TODO to drop hypothesis and try to use pytest's parametrize instead.) ardumont: In what context are you telling me this?
I don't get it.
(My initial parametrize comment was… | |||||
Not Done Inline Actions: sorry, I confused this comment with "In this test method, this does not look like the order matters." vlorentz: sorry, I confused this comment with "In this test method, this does not look like the order… | |||||
self.reset() | self.reset() | ||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{ | { | ||||
"url": "http://origin1", | "url": origin1_foo.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo"], | "keywords": ["foo"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin2", | "url": origin2_foobar.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar"], | "keywords": ["foo", "bar"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | "url": origin3_foobarbaz.url, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar", "baz"], | "keywords": ["foo", "bar", "baz"], | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo bar baz", count=count | self.search.origin_search, metadata_pattern="foo bar baz", limit=limit | ||||
) | ) | ||||
assert list(results) == [{"url": "http://origin3"}] | assert set(results) == set([origin3_foobarbaz]) | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo bar", count=count | self.search.origin_search, metadata_pattern="foo bar", limit=limit | ||||
) | ) | ||||
assert list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}] | assert set(results) == set([origin2_foobar, origin3_foobarbaz]) | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo", count=count | self.search.origin_search, metadata_pattern="foo", limit=limit | ||||
) | ) | ||||
assert list(results) == [ | assert set(results) == set([origin1_foo, origin2_foobar, origin3_foobarbaz]) | ||||
{"url": "http://origin1"}, | |||||
{"url": "http://origin2"}, | |||||
{"url": "http://origin3"}, | |||||
] |
I think you should include the TLD in the variable name (e.g. origin_foobar_baz), so it's clearer below why it matches.
If you disagree, at least rename origin_quux to origin_qux