Changeset View
Changeset View
Standalone View
Standalone View
swh/search/tests/test_search.py
# Copyright (C) 2019-2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from hypothesis import given, settings, strategies

from swh.core.api.classes import stream_results
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines | def test_origin_with_visit_added(self): | ||||
[{**o, "has_visits": True} for o in [origin_foobar_baz]] | [{**o, "has_visits": True} for o in [origin_foobar_baz]] | ||||
) | ) | ||||
self.search.flush() | self.search.flush() | ||||
actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) | ||||
assert actual_page.next_page_token is None | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [origin_foobar_baz] | assert actual_page.results == [origin_foobar_baz] | ||||
def test_origin_no_visit_types_search(self):
    """An origin indexed without any visit type must not match a search
    filtered on visit types, but must still match an unfiltered search."""
    indexed = [{"url": "http://foobar.baz"}]
    self.search.origin_update(indexed)
    self.search.flush()

    # Filtering on a visit type excludes the origin (it has none recorded).
    page = self.search.origin_search(url_pattern="http", visit_types=["git"])
    assert page.next_page_token is None
    assert sorted(hit["url"] for hit in page.results) == []

    # No visit-type filter: the origin is returned.
    page = self.search.origin_search(url_pattern="http", visit_types=None)
    assert page.next_page_token is None
    found = sorted(hit["url"] for hit in page.results)
    assert found == sorted(o["url"] for o in indexed)
def test_origin_visit_types_search(self):
    """Each origin must be found when filtering on its own visit type,
    and all origins must be found when no visit-type filter is given."""
    indexed = [
        {"url": "http://foobar.baz", "visit_types": ["git"]},
        {"url": "http://barbaz.qux", "visit_types": ["svn"]},
        {"url": "http://qux.quux", "visit_types": ["hg"]},
    ]
    self.search.origin_update(indexed)
    self.search.flush()

    for entry in indexed:
        # Filtering on this origin's visit type returns exactly this origin.
        page = self.search.origin_search(
            url_pattern="http", visit_types=entry["visit_types"]
        )
        assert page.next_page_token is None
        found = sorted(hit["url"] for hit in page.results)
        assert found == sorted([entry["url"]])

    # Without a filter, every indexed origin matches.
    page = self.search.origin_search(url_pattern="http", visit_types=None)
    assert page.next_page_token is None
    found = sorted(hit["url"] for hit in page.results)
    assert found == sorted(o["url"] for o in indexed)
def test_origin_visit_types_update_search(self):
    """Check that visit types accumulate across successive updates of the
    same origin document, and that search filtering on any combination of
    visit types intersecting the recorded ones keeps matching the origin.

    The update step and the check step are split into two helpers (per
    review feedback) so each does one thing.
    """
    origin_url = "http://foobar.baz"
    self.search.origin_update([{"url": origin_url}])
    self.search.flush()

    def _add_visit_type(visit_type):
        # Record one additional visit type on the already-indexed origin.
        self.search.origin_update(
            [{"url": origin_url, "visit_types": [visit_type]}]
        )
        self.search.flush()

    def _check_visit_types(visit_types_list):
        # The origin must match any filter whose visit types intersect
        # the set recorded so far.
        for visit_types in visit_types_list:
            actual_page = self.search.origin_search(
                url_pattern="http", visit_types=visit_types
            )
            assert actual_page.next_page_token is None
            results = [r["url"] for r in actual_page.results]
            assert sorted(results) == [origin_url]

    _add_visit_type("git")
    _check_visit_types([["git"], ["git", "hg"]])

    _add_visit_type("svn")
    _check_visit_types(
        [["git"], ["svn"], ["svn", "git"], ["git", "hg", "svn"]]
    )

    _add_visit_type("hg")
    _check_visit_types(
        [
            ["git"],
            ["svn"],
            ["hg"],
            ["svn", "git"],
            ["hg", "git"],
            ["hg", "svn"],
            ["git", "hg", "svn"],
        ]
    )
def test_origin_intrinsic_metadata_description(self): | def test_origin_intrinsic_metadata_description(self): | ||||
origin1_nothin = {"url": "http://origin1"} | origin1_nothin = {"url": "http://origin1"} | ||||
origin2_foobar = {"url": "http://origin2"} | origin2_foobar = {"url": "http://origin2"} | ||||
origin3_barbaz = {"url": "http://origin3"} | origin3_barbaz = {"url": "http://origin3"} | ||||
self.search.origin_update( | self.search.origin_update( | ||||
[ | [ | ||||
{**origin1_nothin, "intrinsic_metadata": {},}, | {**origin1_nothin, "intrinsic_metadata": {},}, | ||||
▲ Show 20 Lines • Show All 199 Lines • ▼ Show 20 Lines | def test_origin_intrinsic_metadata_inconsistent_type(self): | ||||
actual_page = self.search.origin_search(metadata_pattern="baz qux") | actual_page = self.search.origin_search(metadata_pattern="baz qux") | ||||
assert actual_page.next_page_token is None | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [origin3_bazqux] | assert actual_page.results == [origin3_bazqux] | ||||
actual_page = self.search.origin_search(metadata_pattern="foo bar") | actual_page = self.search.origin_search(metadata_pattern="foo bar") | ||||
assert actual_page.next_page_token is None | assert actual_page.next_page_token is None | ||||
assert actual_page.results == [origin1_foobar] | assert actual_page.results == [origin1_foobar] | ||||
def test_origin_intrinsic_metadata_date(self):
    """Checks inserting a date-like in a field does not update the mapping to
    require every document uses a date in that field; or that search queries
    use a date either.
    Likewise for numeric fields."""
    origin1 = {"url": "http://origin1"}
    origin2 = {"url": "http://origin2"}

    # First document: well-formed date and numeric-looking version.
    doc1 = {
        **origin1,
        "intrinsic_metadata": {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "dateCreated": "2021-02-18T10:16:52",
            "version": "1.0",
        },
    }
    self.search.origin_update([doc1])
    self.search.flush()

    # Second document: free-text values in the same fields; indexing it
    # must not be rejected because of the first document's value types.
    doc2 = {
        **origin2,
        "intrinsic_metadata": {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "dateCreated": "a long time ago",
            "address": "in a galaxy far, far away",
            "version": "a new hope",
        },
    }
    self.search.origin_update([doc2])
    self.search.flush()

    page = self.search.origin_search(metadata_pattern="2021")
    assert page.next_page_token is None
    assert page.results == [origin1]

    page = self.search.origin_search(metadata_pattern="long time ago")
    assert page.next_page_token is None
    assert page.results == [origin2]
def test_origin_intrinsic_metadata_update(self):
    """Re-indexing an origin with new intrinsic metadata must replace the
    previously indexed metadata for search purposes."""
    origin = {"url": "http://origin1"}
    document = {
        **origin,
        "intrinsic_metadata": {
            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
            "author": "John Doe",
        },
    }

    self.search.origin_update([document])
    self.search.flush()

    # The origin is found through its initial metadata.
    page = self.search.origin_search(metadata_pattern="John")
    assert page.next_page_token is None
    assert page.results == [origin]

    # Update the author and re-index: the new value must be searchable.
    document["intrinsic_metadata"]["author"] = "Jane Doe"
    self.search.origin_update([document])
    self.search.flush()

    page = self.search.origin_search(metadata_pattern="Jane")
    assert page.next_page_token is None
    assert page.results == [origin]
# TODO: add more tests with more codemeta terms | # TODO: add more tests with more codemeta terms | ||||
# TODO: add more tests with edge cases | # TODO: add more tests with edge cases | ||||
@settings(deadline=None) | @settings(deadline=None) | ||||
@given(strategies.integers(min_value=1, max_value=4)) | @given(strategies.integers(min_value=1, max_value=4)) | ||||
def test_origin_url_paging(self, limit): | def test_origin_url_paging(self, limit): | ||||
# TODO: no hypothesis | # TODO: no hypothesis | ||||
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines |
The function name should mention updates (as it's what it's really testing); and there should be a separate test for updating the origins in a separate step (ie. self.search.origin_update([{"url": "http://foobar.baz", "visit_types": ["git", "svn"]}]) + self.search.origin_update([{"url": "http://foobar.baz", "visit_types": ["hg"]}]))