Changeset View
Standalone View
swh/search/tests/test_search.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from hypothesis import given, strategies, settings | from hypothesis import given, strategies, settings | ||||
from swh.search.utils import stream_results | from swh.search.utils import stream_results | ||||
class CommonSearchTest: | class CommonSearchTest: | ||||
def test_origin_url_unique_word_prefix(self): | def test_origin_url_unique_word_prefix(self): | ||||
self.search.origin_update( | origin_foobar_baz = {"url": "http://foobar.baz"} | ||||
[ | origin_barbaz_qux = {"url": "http://barbaz.qux"} | ||||
{"url": "http://foobar.baz"}, | origin_qux_quux = {"url": "http://qux.quux"} | ||||
{"url": "http://barbaz.qux"}, | origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] | ||||
{"url": "http://qux.quux"}, | |||||
] | self.search.origin_update(origins) | ||||
) | |||||
self.search.flush() | self.search.flush() | ||||
vlorentz: I think you should include the TLD in the variable name (eg. `origin_foobar_baz`), so it's… | |||||
results = self.search.origin_search(url_pattern="foobar") | actual_page = self.search.origin_search(url_pattern="foobar") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | actual_page = self.search.origin_search(url_pattern="barb") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(url_pattern="barb") | assert actual_page.results == [origin_barbaz_qux] | ||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | |||||
# 'bar' is part of 'foobar', but is not the beginning of it | # 'bar' is part of 'foobar', but is not the beginning of it | ||||
results = self.search.origin_search(url_pattern="bar") | actual_page = self.search.origin_search(url_pattern="bar") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_barbaz_qux] | ||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | actual_page = self.search.origin_search(url_pattern="barbaz") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(url_pattern="barbaz") | assert actual_page.results == [origin_barbaz_qux] | ||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://barbaz.qux"}], | |||||
} | |||||
def test_origin_url_unique_word_prefix_multiple_results(self): | def test_origin_url_unique_word_prefix_multiple_results(self): | ||||
origin_foobar_baz = {"url": "http://foobar.baz"} | |||||
origin_barbaz_qux = {"url": "http://barbaz.qux"} | |||||
origin_qux_quux = {"url": "http://qux.quux"} | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] | ||||
{"url": "http://foobar.baz"}, | |||||
{"url": "http://barbaz.qux"}, | |||||
{"url": "http://qux.quux"}, | |||||
] | |||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="qu") | actual_page = self.search.origin_search(url_pattern="qu") | ||||
assert results["next_page_token"] is None | assert actual_page.next_page_token is None | ||||
results = [r["url"] for r in actual_page.results] | |||||
results = [res["url"] for res in results["results"]] | expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] | ||||
expected_results = ["http://qux.quux", "http://barbaz.qux"] | |||||
assert sorted(results) == sorted(expected_results) | assert sorted(results) == sorted(expected_results) | ||||
results = self.search.origin_search(url_pattern="qux") | actual_page = self.search.origin_search(url_pattern="qux") | ||||
assert results["next_page_token"] is None | assert actual_page.next_page_token is None | ||||
results = [r["url"] for r in actual_page.results] | |||||
results = [res["url"] for res in results["results"]] | expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] | ||||
expected_results = ["http://barbaz.qux", "http://qux.quux"] | |||||
assert sorted(results) == sorted(expected_results) | assert sorted(results) == sorted(expected_results) | ||||
def test_origin_url_all_terms(self): | def test_origin_url_all_terms(self): | ||||
self.search.origin_update( | origin_foo_bar_baz = {"url": "http://foo.bar/baz"} | ||||
[{"url": "http://foo.bar/baz"}, {"url": "http://foo.bar/foo.bar"},] | origin_foo_bar_foo_bar = {"url": "http://foo.bar/foo.bar"} | ||||
) | origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar] | ||||
self.search.origin_update(origins) | |||||
self.search.flush() | self.search.flush() | ||||
# Only results containing all terms should be returned. | # Only results containing all terms should be returned. | ||||
results = self.search.origin_search(url_pattern="foo bar baz") | actual_page = self.search.origin_search(url_pattern="foo bar baz") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foo_bar_baz] | ||||
"results": [{"url": "http://foo.bar/baz"},], | |||||
} | |||||
def test_origin_with_visit(self): | def test_origin_with_visit(self): | ||||
origin_foobar_baz = {"url": "http://foobar/baz"} | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[{"url": "http://foobar.baz", "has_visits": True},] | [{**o, "has_visits": True} for o in [origin_foobar_baz]] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | |||||
def test_origin_with_visit_added(self): | def test_origin_with_visit_added(self): | ||||
self.search.origin_update( | origin_foobar_baz = {"url": "http://foobar.baz"} | ||||
[{"url": "http://foobar.baz"},] | |||||
) | self.search.origin_update([origin_foobar_baz]) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == {"next_page_token": None, "results": []} | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [] | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[{"url": "http://foobar.baz", "has_visits": True},] | [{**o, "has_visits": True} for o in [origin_foobar_baz]] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin_foobar_baz] | ||||
"results": [{"url": "http://foobar.baz"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_description(self): | def test_origin_intrinsic_metadata_description(self): | ||||
origin1_nothin = {"url": "http://origin1"} | |||||
origin2_foobar = {"url": "http://origin2"} | |||||
origin3_barbaz = {"url": "http://origin3"} | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{"url": "http://origin1", "intrinsic_metadata": {},}, | {**origin1_nothin, "intrinsic_metadata": {},}, | ||||
{ | { | ||||
"url": "http://origin2", | **origin2_foobar, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar", | "description": "foo bar", | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | **origin3_barbaz, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "bar baz", | "description": "bar baz", | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo") | actual_page = self.search.origin_search(metadata_pattern="foo") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin2_foobar] | ||||
"results": [{"url": "http://origin2"}], | |||||
} | actual_page = self.search.origin_search(metadata_pattern="foo bar") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(metadata_pattern="foo bar") | assert actual_page.results == [origin2_foobar] | ||||
assert results == { | |||||
"next_page_token": None, | actual_page = self.search.origin_search(metadata_pattern="bar baz") | ||||
"results": [{"url": "http://origin2"}], | assert actual_page.next_page_token is None | ||||
} | assert actual_page.results == [origin3_barbaz] | ||||
results = self.search.origin_search(metadata_pattern="bar baz") | |||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_all_terms(self): | def test_origin_intrinsic_metadata_all_terms(self): | ||||
origin1_foobarfoobar = {"url": "http://origin1"} | |||||
origin3_foobarbaz = {"url": "http://origin2"} | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{ | { | ||||
"url": "http://origin1", | **origin1_foobarfoobar, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar foo bar", | "description": "foo bar foo bar", | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | **origin3_foobarbaz, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"description": "foo bar baz", | "description": "foo bar baz", | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo bar baz") | actual_page = self.search.origin_search(metadata_pattern="foo bar baz") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin3_foobarbaz] | ||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
def test_origin_intrinsic_metadata_nested(self): | def test_origin_intrinsic_metadata_nested(self): | ||||
origin1_nothin = {"url": "http://origin1"} | |||||
origin2_foobar = {"url": "http://origin2"} | |||||
origin3_barbaz = {"url": "http://origin3"} | |||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{"url": "http://origin1", "intrinsic_metadata": {},}, | {**origin1_nothin, "intrinsic_metadata": {},}, | ||||
{ | { | ||||
"url": "http://origin2", | **origin2_foobar, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar"], | "keywords": ["foo", "bar"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | **origin3_barbaz, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["bar", "baz"], | "keywords": ["bar", "baz"], | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = self.search.origin_search(metadata_pattern="foo") | actual_page = self.search.origin_search(metadata_pattern="foo") | ||||
assert results == { | assert actual_page.next_page_token is None | ||||
"next_page_token": None, | assert actual_page.results == [origin2_foobar] | ||||
"results": [{"url": "http://origin2"}], | |||||
} | actual_page = self.search.origin_search(metadata_pattern="foo bar") | ||||
assert actual_page.next_page_token is None | |||||
results = self.search.origin_search(metadata_pattern="foo bar") | assert actual_page.results == [origin2_foobar] | ||||
assert results == { | |||||
"next_page_token": None, | actual_page = self.search.origin_search(metadata_pattern="bar baz") | ||||
"results": [{"url": "http://origin2"}], | assert actual_page.next_page_token is None | ||||
} | assert actual_page.results == [origin3_barbaz] | ||||
results = self.search.origin_search(metadata_pattern="bar baz") | |||||
assert results == { | |||||
"next_page_token": None, | |||||
"results": [{"url": "http://origin3"}], | |||||
} | |||||
# TODO: add more tests with more codemeta terms | # TODO: add more tests with more codemeta terms | ||||
# TODO: add more tests with edge cases | # TODO: add more tests with edge cases | ||||
@settings(deadline=None) | @settings(deadline=None) | ||||
@given(strategies.integers(min_value=1, max_value=4)) | @given(strategies.integers(min_value=1, max_value=4)) | ||||
def test_origin_url_paging(self, count): | def test_origin_url_paging(self, limit): | ||||
# TODO: no hypothesis | # TODO: no hypothesis | ||||
origin1_foo = {"url": "http://origin1/foo"} | |||||
origin2_foobar = {"url": "http://origin2/foo/bar"} | |||||
origin3_foobarbaz = {"url": "http://origin3/foo/bar/baz"} | |||||
self.reset() | self.reset() | ||||
self.search.origin_update( | self.search.origin_update([origin1_foo, origin2_foobar, origin3_foobarbaz]) | ||||
[ | |||||
{"url": "http://origin1/foo"}, | |||||
{"url": "http://origin2/foo/bar"}, | |||||
{"url": "http://origin3/foo/bar/baz"}, | |||||
] | |||||
) | |||||
self.search.flush() | self.search.flush() | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo bar baz", count=count | self.search.origin_search, url_pattern="foo bar baz", limit=limit | ||||
) | ) | ||||
results = [res["url"] for res in results] | results = [res["url"] for res in results] | ||||
expected_results = [ | expected_results = [o["url"] for o in [origin3_foobarbaz]] | ||||
"http://origin3/foo/bar/baz", | |||||
] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo bar", count=count | self.search.origin_search, url_pattern="foo bar", limit=limit | ||||
) | ) | ||||
expected_results = [ | |||||
"http://origin2/foo/bar", | |||||
"http://origin3/foo/bar/baz", | |||||
] | |||||
results = [res["url"] for res in results] | results = [res["url"] for res in results] | ||||
expected_results = [o["url"] for o in [origin2_foobar, origin3_foobarbaz]] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, url_pattern="foo", count=count | self.search.origin_search, url_pattern="foo", limit=limit | ||||
) | ) | ||||
results = [res["url"] for res in results] | |||||
expected_results = [ | expected_results = [ | ||||
"http://origin1/foo", | o["url"] for o in [origin1_foo, origin2_foobar, origin3_foobarbaz] | ||||
"http://origin2/foo/bar", | |||||
"http://origin3/foo/bar/baz", | |||||
] | ] | ||||
results = [res["url"] for res in results] | |||||
assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) | ||||
Done Inline Actions
In this test method, this does not look like the order matters. ardumont: > Errrr, actually, no. The order of results matters, and tests must check it
In this test… | |||||
Done Inline Actions
vlorentz answered elsewhere, here goes.
ardumont: >> What did I miss?
vlorentz answered elsewhere, here goes.
> vlorentz: finding a word in a… | |||||
@settings(deadline=None) | @settings(deadline=None) | ||||
@given(strategies.integers(min_value=1, max_value=4)) | @given(strategies.integers(min_value=1, max_value=4)) | ||||
def test_origin_intrinsic_metadata_paging(self, count): | def test_origin_intrinsic_metadata_paging(self, limit): | ||||
# TODO: no hypothesis | # TODO: no hypothesis | ||||
origin1_foo = {"url": "http://origin1"} | |||||
origin2_foobar = {"url": "http://origin2"} | |||||
origin3_foobarbaz = {"url": "http://origin3"} | |||||
Done Inline Actions: I tried to replace this with pytest.mark.parametrize but that does not work... [1] I think it's not working directly on method classes, so later I guess. ardumont: I tried to replace this with pytest.mark.parametrize but that does not work... [1]
I think… | |||||
Not Done Inline Actions: finding a word in a short URL is a better word than finding the same word in a long URL. vlorentz: finding a word in a short URL is a better word than finding the same word in a long URL. | |||||
Not Done Inline Actionsbetter match * vlorentz: better match * | |||||
Done Inline Actions: In what context are you telling me this? I don't get it. (My initial parametrize comment was about the TODO to drop hypothesis and try to use pytest's parametrize instead.) ardumont: in what context are you telling me this?
I don't get it.
(My initial parametrize comment was… | |||||
Not Done Inline Actions: sorry, I confused this comment with "In this test method, this does not look like the order matters." vlorentz: sorry, I confused this comment with "In this test method, this does not look like the order… | |||||
self.reset() | self.reset() | ||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{ | { | ||||
"url": "http://origin1", | **origin1_foo, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo"], | "keywords": ["foo"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin2", | **origin2_foobar, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar"], | "keywords": ["foo", "bar"], | ||||
}, | }, | ||||
}, | }, | ||||
{ | { | ||||
"url": "http://origin3", | **origin3_foobarbaz, | ||||
"intrinsic_metadata": { | "intrinsic_metadata": { | ||||
"@context": "https://doi.org/10.5063/schema/codemeta-2.0", | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", | ||||
"keywords": ["foo", "bar", "baz"], | "keywords": ["foo", "bar", "baz"], | ||||
}, | }, | ||||
}, | }, | ||||
] | ] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo bar baz", count=count | self.search.origin_search, metadata_pattern="foo bar baz", limit=limit | ||||
) | ) | ||||
assert list(results) == [{"url": "http://origin3"}] | assert list(results) == [origin3_foobarbaz] | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo bar", count=count | self.search.origin_search, metadata_pattern="foo bar", limit=limit | ||||
) | ) | ||||
assert list(results) == [{"url": "http://origin2"}, {"url": "http://origin3"}] | assert list(results) == [origin2_foobar, origin3_foobarbaz] | ||||
results = stream_results( | results = stream_results( | ||||
self.search.origin_search, metadata_pattern="foo", count=count | self.search.origin_search, metadata_pattern="foo", limit=limit | ||||
) | ) | ||||
assert list(results) == [ | assert list(results) == [origin1_foo, origin2_foobar, origin3_foobarbaz] | ||||
{"url": "http://origin1"}, | |||||
{"url": "http://origin2"}, | |||||
{"url": "http://origin3"}, | |||||
] |
I think you should include the TLD in the variable name (e.g. origin_foobar_baz), so it's clearer below why it matches.
If you disagree, at least rename origin_quux to origin_qux.