diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index 40b2b4f..cdc34a1 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,198 +1,199 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from typing import Any, Dict, Iterable, Iterator, List, Optional from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk, scan import msgpack from swh.indexer import codemeta from swh.model import model from swh.model.identifiers import origin_identifier from swh.search.interface import PagedResult def _sanitize_origin(origin): origin = origin.copy() res = {"url": origin.pop("url")} for field_name in ("intrinsic_metadata", "has_visits"): if field_name in origin: res[field_name] = origin.pop(field_name) if "intrinsic_metadata" in res: res["intrinsic_metadata"] = codemeta.expand(res["intrinsic_metadata"]) return res def token_encode(index_to_tokenize: Dict[bytes, Any]) -> str: """Tokenize as string an index page result from a search """ page_token = base64.b64encode(msgpack.dumps(index_to_tokenize)) return page_token.decode() def token_decode(page_token: str) -> Dict[bytes, Any]: """Read the page_token """ return msgpack.loads(base64.b64decode(page_token.encode()), raw=True) class ElasticSearch: def __init__(self, hosts: List[str]): self._backend = Elasticsearch(hosts=hosts) def check(self): return self._backend.ping() def deinitialize(self) -> None: """Removes all indices from the Elasticsearch backend""" self._backend.indices.delete(index="*") def initialize(self) -> None: """Declare Elasticsearch indices and mappings""" if not self._backend.indices.exists(index="origin"): self._backend.indices.create(index="origin") self._backend.indices.put_mapping( index="origin", body={ "properties": { "sha1": {"type": "keyword", "doc_values": True,}, "url": { "type": "text", # To split URLs into token on any character # that is not alphanumerical "analyzer": "simple", "fields": { "as_you_type": { "type": "search_as_you_type", "analyzer": "simple", } }, }, "has_visits": {"type": "boolean",}, "intrinsic_metadata": { "type": "nested", "properties": { "@context": { # don't bother indexing tokens "type": "keyword", } }, }, } }, ) def flush(self) -> None: self._backend.indices.refresh(index="_all") def origin_update(self, documents: Iterable[Dict]) -> None: documents = map(_sanitize_origin, documents) documents_with_sha1 = ( (origin_identifier(document), document) for document in documents ) actions = [ { "_op_type": "update", "_id": sha1, "_index": "origin", "doc": {**document, "sha1": sha1,}, "doc_as_upsert": True, } for (sha1, document) in documents_with_sha1 ] bulk(self._backend, actions, index="origin") def origin_dump(self) -> Iterator[model.Origin]: results = scan(self._backend, index="*") for hit in results: yield self._backend.termvectors(index="origin", id=hit["_id"], fields=["*"]) def origin_search( self, *, url_pattern: Optional[str] = None, metadata_pattern: Optional[str] = None, with_visit: bool = False, page_token: Optional[str] = None, limit: int = 50, ) -> PagedResult[Dict[str, Any]]: query_clauses: List[Dict[str, Any]] = [] if url_pattern: query_clauses.append( { "multi_match": { "query": url_pattern, "type": "bool_prefix", "operator": "and", "fields": [ "url.as_you_type", "url.as_you_type._2gram", "url.as_you_type._3gram", ], } } ) if metadata_pattern: query_clauses.append( { "nested": { "path": "intrinsic_metadata", "query": { "multi_match": { "query": metadata_pattern, + "type": "cross_fields", "operator": "and", "fields": ["intrinsic_metadata.*"], } }, } } ) if not query_clauses: raise ValueError( "At least one of url_pattern and metadata_pattern must be provided." ) next_page_token: Optional[str] = None if with_visit: query_clauses.append({"term": {"has_visits": True,}}) body = { "query": {"bool": {"must": query_clauses,}}, "sort": [{"_score": "desc"}, {"sha1": "asc"},], } if page_token: # TODO: use ElasticSearch's scroll API? page_token_content = token_decode(page_token) body["search_after"] = [ page_token_content[b"score"], page_token_content[b"sha1"].decode("ascii"), ] res = self._backend.search(index="origin", body=body, size=limit) hits = res["hits"]["hits"] if len(hits) == limit: last_hit = hits[-1] next_page_token_content = { b"score": last_hit["_score"], b"sha1": last_hit["_source"]["sha1"], } next_page_token = token_encode(next_page_token_content) assert len(hits) <= limit return PagedResult( results=[{"url": hit["_source"]["url"]} for hit in hits], next_page_token=next_page_token, ) diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py index f17872f..e62a669 100644 --- a/swh/search/tests/test_in_memory.py +++ b/swh/search/tests/test_in_memory.py @@ -1,45 +1,49 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.search import get_search from .test_search import CommonSearchTest class InmemorySearchTest(unittest.TestCase, CommonSearchTest): @pytest.fixture(autouse=True) def _instantiate_search(self): self.search = get_search("memory") def setUp(self): self.reset() def reset(self): self.search.deinitialize() self.search.initialize() @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_description(self): pass @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_all_terms(self): pass @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_nested(self): pass @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_paging(self): pass @pytest.mark.skip("Not implemented in the in-memory search") def test_origin_intrinsic_metadata_inconsistent_type(self): pass + + @pytest.mark.skip("Not implemented in the in-memory search") + def test_origin_intrinsic_metadata_matches_cross_fields(self): + pass diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py index 929d9c8..2abbfcb 100644 --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1,365 +1,386 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given, settings, strategies from swh.core.api.classes import stream_results class CommonSearchTest: def test_origin_url_unique_word_prefix(self): origin_foobar_baz = {"url": "http://foobar.baz"} origin_barbaz_qux = {"url": "http://barbaz.qux"} origin_qux_quux = {"url": "http://qux.quux"} origins = [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] self.search.origin_update(origins) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar") assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] actual_page = self.search.origin_search(url_pattern="barb") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] # 'bar' is part of 'foobar', but is not the beginning of it actual_page = self.search.origin_search(url_pattern="bar") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] actual_page = self.search.origin_search(url_pattern="barbaz") assert actual_page.next_page_token is None assert actual_page.results == [origin_barbaz_qux] def test_origin_url_unique_word_prefix_multiple_results(self): origin_foobar_baz = {"url": "http://foobar.baz"} origin_barbaz_qux = {"url": "http://barbaz.qux"} origin_qux_quux = {"url": "http://qux.quux"} self.search.origin_update( [origin_foobar_baz, origin_barbaz_qux, origin_qux_quux] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="qu") assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] assert sorted(results) == sorted(expected_results) actual_page = self.search.origin_search(url_pattern="qux") assert actual_page.next_page_token is None results = [r["url"] for r in actual_page.results] expected_results = [o["url"] for o in [origin_qux_quux, origin_barbaz_qux]] assert sorted(results) == sorted(expected_results) def test_origin_url_all_terms(self): origin_foo_bar_baz = {"url": "http://foo.bar/baz"} origin_foo_bar_foo_bar = {"url": "http://foo.bar/foo.bar"} origins = [origin_foo_bar_baz, origin_foo_bar_foo_bar] self.search.origin_update(origins) self.search.flush() # Only results containing all terms should be returned. actual_page = self.search.origin_search(url_pattern="foo bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin_foo_bar_baz] def test_origin_with_visit(self): origin_foobar_baz = {"url": "http://foobar/baz"} self.search.origin_update( [{**o, "has_visits": True} for o in [origin_foobar_baz]] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] def test_origin_with_visit_added(self): origin_foobar_baz = {"url": "http://foobar.baz"} self.search.origin_update([origin_foobar_baz]) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [] self.search.origin_update( [{**o, "has_visits": True} for o in [origin_foobar_baz]] ) self.search.flush() actual_page = self.search.origin_search(url_pattern="foobar", with_visit=True) assert actual_page.next_page_token is None assert actual_page.results == [origin_foobar_baz] def test_origin_intrinsic_metadata_description(self): origin1_nothin = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_barbaz = {"url": "http://origin3"} self.search.origin_update( [ {**origin1_nothin, "intrinsic_metadata": {},}, { **origin2_foobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar", }, }, { **origin3_barbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "bar baz", }, }, ] ) self.search.flush() actual_page = self.search.origin_search(metadata_pattern="foo") assert actual_page.next_page_token is None assert actual_page.results == [origin2_foobar] actual_page = self.search.origin_search(metadata_pattern="foo bar") assert actual_page.next_page_token is None assert actual_page.results == [origin2_foobar] actual_page = self.search.origin_search(metadata_pattern="bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin3_barbaz] def test_origin_intrinsic_metadata_all_terms(self): origin1_foobarfoobar = {"url": "http://origin1"} origin3_foobarbaz = {"url": "http://origin2"} self.search.origin_update( [ { **origin1_foobarfoobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar foo bar", }, }, { **origin3_foobarbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "description": "foo bar baz", }, }, ] ) self.search.flush() actual_page = self.search.origin_search(metadata_pattern="foo bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin3_foobarbaz] + def test_origin_intrinsic_metadata_matches_cross_fields(self): + """Checks the backend finds results even if the two words in the query are + each in a different field.""" + origin1 = {"url": "http://origin1"} + + self.search.origin_update( + [ + { + **origin1, + "intrinsic_metadata": { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "description": "foo bar", + "author": "John Doe", + }, + }, + ] + ) + self.search.flush() + + actual_page = self.search.origin_search(metadata_pattern="foo John") + assert actual_page.next_page_token is None + assert actual_page.results == [origin1] + def test_origin_intrinsic_metadata_nested(self): origin1_nothin = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_barbaz = {"url": "http://origin3"} self.search.origin_update( [ {**origin1_nothin, "intrinsic_metadata": {},}, { **origin2_foobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar"], }, }, { **origin3_barbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["bar", "baz"], }, }, ] ) self.search.flush() actual_page = self.search.origin_search(metadata_pattern="foo") assert actual_page.next_page_token is None assert actual_page.results == [origin2_foobar] actual_page = self.search.origin_search(metadata_pattern="foo bar") assert actual_page.next_page_token is None assert actual_page.results == [origin2_foobar] actual_page = self.search.origin_search(metadata_pattern="bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin3_barbaz] def test_origin_intrinsic_metadata_inconsistent_type(self): """Checks the same field can have a concrete value, an object, or an array in different documents.""" origin1_foobar = {"url": "http://origin1"} origin2_barbaz = {"url": "http://origin2"} origin3_bazqux = {"url": "http://origin3"} self.search.origin_update( [ { **origin1_foobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": {"familyName": "Foo", "givenName": "Bar",}, }, }, ] ) self.search.flush() self.search.origin_update( [ { **origin2_barbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": "Bar Baz", }, }, { **origin3_bazqux, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "author": ["Baz", "Qux"], }, }, ] ) self.search.flush() actual_page = self.search.origin_search(metadata_pattern="bar") assert actual_page.next_page_token is None assert actual_page.results == [origin2_barbaz, origin1_foobar] actual_page = self.search.origin_search(metadata_pattern="baz") assert actual_page.next_page_token is None assert actual_page.results == [origin2_barbaz, origin3_bazqux] actual_page = self.search.origin_search(metadata_pattern="foo") assert actual_page.next_page_token is None assert actual_page.results == [origin1_foobar] actual_page = self.search.origin_search(metadata_pattern="bar baz") assert actual_page.next_page_token is None assert actual_page.results == [origin2_barbaz] actual_page = self.search.origin_search(metadata_pattern="qux") assert actual_page.next_page_token is None assert actual_page.results == [origin3_bazqux] actual_page = self.search.origin_search(metadata_pattern="baz qux") assert actual_page.next_page_token is None assert actual_page.results == [origin3_bazqux] - # FIXME: the following won't work because "foo" and "bar" are not in the - # same field. - # actual_page = self.search.origin_search(metadata_pattern="foo bar") - # assert actual_page.next_page_token is None - # assert actual_page.results == [origin2_foobar] + actual_page = self.search.origin_search(metadata_pattern="foo bar") + assert actual_page.next_page_token is None + assert actual_page.results == [origin1_foobar] # TODO: add more tests with more codemeta terms # TODO: add more tests with edge cases @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) def test_origin_url_paging(self, limit): # TODO: no hypothesis origin1_foo = {"url": "http://origin1/foo"} origin2_foobar = {"url": "http://origin2/foo/bar"} origin3_foobarbaz = {"url": "http://origin3/foo/bar/baz"} self.reset() self.search.origin_update([origin1_foo, origin2_foobar, origin3_foobarbaz]) self.search.flush() results = stream_results( self.search.origin_search, url_pattern="foo bar baz", limit=limit ) results = [res["url"] for res in results] expected_results = [o["url"] for o in [origin3_foobarbaz]] assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) results = stream_results( self.search.origin_search, url_pattern="foo bar", limit=limit ) results = [res["url"] for res in results] expected_results = [o["url"] for o in [origin2_foobar, origin3_foobarbaz]] assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) results = stream_results( self.search.origin_search, url_pattern="foo", limit=limit ) results = [res["url"] for res in results] expected_results = [ o["url"] for o in [origin1_foo, origin2_foobar, origin3_foobarbaz] ] assert sorted(results[0 : len(expected_results)]) == sorted(expected_results) @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) def test_origin_intrinsic_metadata_paging(self, limit): # TODO: no hypothesis origin1_foo = {"url": "http://origin1"} origin2_foobar = {"url": "http://origin2"} origin3_foobarbaz = {"url": "http://origin3"} self.reset() self.search.origin_update( [ { **origin1_foo, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo"], }, }, { **origin2_foobar, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar"], }, }, { **origin3_foobarbaz, "intrinsic_metadata": { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "keywords": ["foo", "bar", "baz"], }, }, ] ) self.search.flush() results = stream_results( self.search.origin_search, metadata_pattern="foo bar baz", limit=limit ) assert list(results) == [origin3_foobarbaz] results = stream_results( self.search.origin_search, metadata_pattern="foo bar", limit=limit ) assert list(results) == [origin2_foobar, origin3_foobarbaz] results = stream_results( self.search.origin_search, metadata_pattern="foo", limit=limit ) assert list(results) == [origin1_foo, origin2_foobar, origin3_foobarbaz]