diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
index 6a1d70a..fd6feb4 100644
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,229 +1,231 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import base64
 from typing import Any, Iterable, Dict, List, Iterator, Optional

 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
 import msgpack

 from swh.core.api import remote_api_endpoint
 from swh.model import model
 from swh.model.identifiers import origin_identifier


 def _sanitize_origin(origin):
     origin = origin.copy()
     res = {
         'url': origin.pop('url')
     }
     for field_name in ('intrinsic_metadata', 'has_visits'):
         if field_name in origin:
             res[field_name] = origin.pop(field_name)
     return res


 class ElasticSearch:
     def __init__(self, hosts: List[str]):
         self._backend = Elasticsearch(hosts=hosts)

     @remote_api_endpoint('check')
     def check(self):
         return self._backend.ping()

     def deinitialize(self) -> None:
         """Removes all indices from the Elasticsearch backend"""
         self._backend.indices.delete(index='*')

     def initialize(self) -> None:
         """Declare Elasticsearch indices and mappings"""
         if not self._backend.indices.exists(index='origin'):
             self._backend.indices.create(index='origin')
         self._backend.indices.put_mapping(
             index='origin',
             body={
                 'properties': {
                     'sha1': {
                         'type': 'keyword',
                         'doc_values': True,
                     },
                     'url': {
                         'type': 'text',
                         # To split URLs into tokens on any character
                         # that is not alphanumerical
                         'analyzer': 'simple',
                         'fields': {
                             'as_you_type': {
                                 'type': 'search_as_you_type',
                                 'analyzer': 'simple',
                             }
                         }
                     },
                     'has_visits': {
                         'type': 'boolean',
                     },
                     'intrinsic_metadata': {
                         'type': 'nested',
                         'properties': {
                             '@context': {
                                 # don't bother indexing tokens
                                 'type': 'keyword',
                             }
                         },
                     },
                 }
             }
         )

     @remote_api_endpoint('flush')
     def flush(self) -> None:
         """Blocks until all previous calls to _update() are completely
         applied."""
         self._backend.indices.refresh(index='_all')

     @remote_api_endpoint('origin/update')
     def origin_update(self, documents: Iterable[dict]) -> None:
         documents = map(_sanitize_origin, documents)
         documents_with_sha1 = ((origin_identifier(document), document)
                                for document in documents)
         actions = [
             {
                 '_op_type': 'update',
                 '_id': sha1,
                 '_index': 'origin',
                 'doc': {
                     **document,
                     'sha1': sha1,
                 },
                 'doc_as_upsert': True,
             }
             for (sha1, document) in documents_with_sha1
         ]
         bulk(self._backend, actions, index='origin')

     def origin_dump(self) -> Iterator[model.Origin]:
         """Returns all content in Elasticsearch's index. Not exposed
         publicly, but useful for tests."""
         results = scan(self._backend, index='*')
         for hit in results:
             yield self._backend.termvectors(
                 index='origin', id=hit['_id'],
                 fields=['*'])

     @remote_api_endpoint('origin/search')
     def origin_search(
             self, *,
             url_pattern: str = None, metadata_pattern: str = None,
             with_visit: bool = False,
             page_token: str = None, count: int = 50
             ) -> Dict[str, object]:
         """Searches for origins matching the `url_pattern`.

         Args:
             url_pattern (str): Part of the URL to search for
             with_visit (bool): Whether origins with no visit are to be
                                filtered out
             page_token (str): Opaque value used for pagination.
             count (int): number of results to return.

         Returns:
             a dictionary with keys:
             * `next_page_token`:
               opaque value used for fetching more results. `None` if there
               are no more results.
             * `results`:
               list of dictionaries with key:
                 * `url`: URL of a matching origin
         """
         query_clauses = []  # type: List[Dict[str, Any]]

         if url_pattern:
             query_clauses.append({
                 'multi_match': {
                     'query': url_pattern,
                     'type': 'bool_prefix',
+                    'operator': 'and',
                     'fields': [
                         'url.as_you_type',
                         'url.as_you_type._2gram',
                         'url.as_you_type._3gram',
                     ]
                 }
             })

         if metadata_pattern:
             query_clauses.append({
                 'nested': {
                     'path': 'intrinsic_metadata',
                     'query': {
                         'multi_match': {
                             'query': metadata_pattern,
+                            'operator': 'and',
                             'fields': ['intrinsic_metadata.*']
                         }
                     },
                 }
             })

         if not query_clauses:
             raise ValueError(
                 'At least one of url_pattern and metadata_pattern '
                 'must be provided.')

         if with_visit:
             query_clauses.append({
                 'term': {
                     'has_visits': True,
                 }
             })

         body = {
             'query': {
                 'bool': {
                     'must': query_clauses,
                 }
             },
             'size': count,
             'sort': [
                 {'_score': 'desc'},
                 {'sha1': 'asc'},
             ]
         }
         if page_token:
             # TODO: use ElasticSearch's scroll API?
             page_token_content = msgpack.loads(
                 base64.b64decode(page_token))
             body['search_after'] = \
                 [page_token_content[b'score'],
                  page_token_content[b'sha1'].decode('ascii')]

         res = self._backend.search(
             index='origin',
             body=body,
             size=count,
         )

         hits = res['hits']['hits']

         if len(hits) == count:
             last_hit = hits[-1]
             next_page_token_content = {
                 b'score': last_hit['_score'],
                 b'sha1': last_hit['_source']['sha1'],
             }
             next_page_token = base64.b64encode(msgpack.dumps(
                 next_page_token_content))  # type: Optional[bytes]
         else:
             next_page_token = None

         return {
             'next_page_token': next_page_token,
             'results': [
                 {
                     # TODO: also add 'id'?
                     'url': hit['_source']['url'],
                 }
                 for hit in hits
             ]
         }
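For reference, a minimal, hypothetical sketch of how a caller could page through the `origin_search` API above using `next_page_token`; the `search` object and the helper name `iter_origin_urls` are illustrative assumptions, similar in spirit to the `stream_results` helper used by the tests below:

    def iter_origin_urls(search, url_pattern, count=50):
        # `search` is assumed to be an initialized ElasticSearch instance
        # (or an RPC client exposing the same origin_search endpoint).
        page_token = None
        while True:
            page = search.origin_search(
                url_pattern=url_pattern, page_token=page_token, count=count)
            for result in page['results']:
                yield result['url']
            page_token = page['next_page_token']
            if page_token is None:  # no more pages
                return
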
diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py
index 8ca9e6d..57312bb 100644
--- a/swh/search/tests/test_in_memory.py
+++ b/swh/search/tests/test_in_memory.py
@@ -1,36 +1,40 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import unittest

 import pytest

 from swh.search import get_search

 from .test_search import CommonSearchTest


 class InmemorySearchTest(unittest.TestCase, CommonSearchTest):
     @pytest.fixture(autouse=True)
     def _instantiate_search(self):
         self.search = get_search('memory', {})

     def setUp(self):
         self.reset()

     def reset(self):
         self.search.deinitialize()
         self.search.initialize()

     @pytest.mark.skip('Not implemented in the in-memory search')
     def test_origin_intrinsic_metadata_description(self):
         pass

+    @pytest.mark.skip('Not implemented in the in-memory search')
+    def test_origin_intrinsic_metadata_all_terms(self):
+        pass
+
     @pytest.mark.skip('Not implemented in the in-memory search')
     def test_origin_intrinsic_metadata_nested(self):
         pass

     @pytest.mark.skip('Not implemented in the in-memory search')
     def test_origin_intrinsic_metadata_paging(self):
         pass
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
index 84b33b8..0105777 100644
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,262 +1,294 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from hypothesis import given, strategies, settings

 from swh.search.utils import stream_results


 class CommonSearchTest:
     def test_origin_url_unique_word_prefix(self):
         self.search.origin_update([
             {'url': 'http://foobar.baz'},
             {'url': 'http://barbaz.qux'},
             {'url': 'http://qux.quux'},
         ])
         self.search.flush()

         results = self.search.origin_search(url_pattern='foobar')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

         results = self.search.origin_search(url_pattern='barb')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

         # 'bar' is part of 'foobar', but is not the beginning of it
         results = self.search.origin_search(url_pattern='bar')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

         results = self.search.origin_search(url_pattern='barbaz')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://barbaz.qux'}]}

     def test_origin_url_unique_word_prefix_multiple_results(self):
         self.search.origin_update([
             {'url': 'http://foobar.baz'},
             {'url': 'http://barbaz.qux'},
             {'url': 'http://qux.quux'},
         ])
         self.search.flush()

         results = self.search.origin_search(url_pattern='qu')
         assert results['next_page_token'] is None
         results = [res['url'] for res in results['results']]
         expected_results = ['http://qux.quux', 'http://barbaz.qux']
         assert sorted(results) == sorted(expected_results)

         results = self.search.origin_search(url_pattern='qux')
         assert results['next_page_token'] is None
         results = [res['url'] for res in results['results']]
         expected_results = ['http://barbaz.qux', 'http://qux.quux']
         assert sorted(results) == sorted(expected_results)

+    def test_origin_url_all_terms(self):
+        self.search.origin_update([
+            {'url': 'http://foo.bar/baz'},
+            {'url': 'http://foo.bar/foo.bar'},
+        ])
+        self.search.flush()
+
+        # Only results containing all terms should be returned.
+        results = self.search.origin_search(url_pattern='foo bar baz')
+        assert results == {'next_page_token': None, 'results': [
+            {'url': 'http://foo.bar/baz'},
+        ]}
+
     def test_origin_with_visit(self):
         self.search.origin_update([
             {'url': 'http://foobar.baz', 'has_visits': True},
         ])
         self.search.flush()

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

     def test_origin_with_visit_added(self):
         self.search.origin_update([
             {'url': 'http://foobar.baz'},
         ])
         self.search.flush()

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
         assert results == {'next_page_token': None, 'results': []}

         self.search.origin_update([
             {'url': 'http://foobar.baz', 'has_visits': True},
         ])
         self.search.flush()

         results = self.search.origin_search(
             url_pattern='foobar', with_visit=True)
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://foobar.baz'}]}

     def test_origin_intrinsic_metadata_description(self):
         self.search.origin_update([
             {
                 'url': 'http://origin1',
                 'intrinsic_metadata': {},
             },
             {
                 'url': 'http://origin2',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'description': 'foo bar',
                 },
             },
             {
                 'url': 'http://origin3',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'description': 'bar baz',
                 }
             },
         ])
         self.search.flush()

         results = self.search.origin_search(metadata_pattern='foo')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}]}

-        # ES returns both results, because blahblah
         results = self.search.origin_search(metadata_pattern='foo bar')
         assert results == {'next_page_token': None, 'results': [
-            {'url': 'http://origin2'}, {'url': 'http://origin3'}]}
+            {'url': 'http://origin2'}]}
         results = self.search.origin_search(metadata_pattern='bar baz')
         assert results == {'next_page_token': None, 'results': [
-            {'url': 'http://origin3'}, {'url': 'http://origin2'}]}
+            {'url': 'http://origin3'}]}
+
+    def test_origin_intrinsic_metadata_all_terms(self):
+        self.search.origin_update([
+            {
+                'url': 'http://origin1',
+                'intrinsic_metadata': {
+                    '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+                    'description': 'foo bar foo bar',
+                },
+            },
+            {
+                'url': 'http://origin3',
+                'intrinsic_metadata': {
+                    '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+                    'description': 'foo bar baz',
+                }
+            },
+        ])
+        self.search.flush()
+
+        results = self.search.origin_search(metadata_pattern='foo bar baz')
+        assert results == {'next_page_token': None, 'results': [
+            {'url': 'http://origin3'}]}

     def test_origin_intrinsic_metadata_nested(self):
         self.search.origin_update([
             {
                 'url': 'http://origin1',
                 'intrinsic_metadata': {},
             },
             {
                 'url': 'http://origin2',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'keywords': ['foo', 'bar'],
                 },
             },
             {
                 'url': 'http://origin3',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'keywords': ['bar', 'baz'],
                 }
             },
         ])
         self.search.flush()

         results = self.search.origin_search(metadata_pattern='foo')
         assert results == {'next_page_token': None, 'results': [
             {'url': 'http://origin2'}]}

         results = self.search.origin_search(metadata_pattern='foo bar')
         assert results == {'next_page_token': None, 'results': [
-            {'url': 'http://origin2'}, {'url': 'http://origin3'}]}
+            {'url': 'http://origin2'}]}

         results = self.search.origin_search(metadata_pattern='bar baz')
         assert results == {'next_page_token': None, 'results': [
-            {'url': 'http://origin3'}, {'url': 'http://origin2'}]}
+            {'url': 'http://origin3'}]}

     # TODO: add more tests with more codemeta terms

     # TODO: add more tests with edge cases

     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
     def test_origin_url_paging(self, count):
         # TODO: no hypothesis
         self.reset()
         self.search.origin_update([
             {'url': 'http://origin1/foo'},
             {'url': 'http://origin2/foo/bar'},
             {'url': 'http://origin3/foo/bar/baz'},
         ])
         self.search.flush()

         results = stream_results(
             self.search.origin_search,
             url_pattern='foo bar baz', count=count)
         results = [res['url'] for res in results]
         expected_results = [
             'http://origin3/foo/bar/baz',
         ]
         assert sorted(results[0:len(expected_results)]) == \
             sorted(expected_results)

         results = stream_results(
             self.search.origin_search,
             url_pattern='foo bar', count=count)
         expected_results = [
             'http://origin2/foo/bar',
             'http://origin3/foo/bar/baz',
         ]
         results = [res['url'] for res in results]
         assert sorted(results[0:len(expected_results)]) == \
             sorted(expected_results)

         results = stream_results(
             self.search.origin_search,
             url_pattern='foo', count=count)
         expected_results = [
             'http://origin1/foo',
             'http://origin2/foo/bar',
             'http://origin3/foo/bar/baz',
         ]
         results = [res['url'] for res in results]
         assert sorted(results[0:len(expected_results)]) == \
             sorted(expected_results)

     @settings(deadline=None)
     @given(strategies.integers(min_value=1, max_value=4))
     def test_origin_intrinsic_metadata_paging(self, count):
         # TODO: no hypothesis
         self.reset()
         self.search.origin_update([
             {
                 'url': 'http://origin1',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'keywords': ['foo'],
                 },
             },
             {
                 'url': 'http://origin2',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'keywords': ['foo', 'bar'],
                 },
             },
             {
                 'url': 'http://origin3',
                 'intrinsic_metadata': {
                     '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                     'keywords': ['foo', 'bar', 'baz'],
                 }
             },
         ])
         self.search.flush()

         results = stream_results(
             self.search.origin_search,
             metadata_pattern='foo bar baz', count=count)
         assert list(results) == [
-            {'url': 'http://origin3'},
-            {'url': 'http://origin2'},
-            {'url': 'http://origin1'}]
+            {'url': 'http://origin3'}]

         results = stream_results(
             self.search.origin_search,
             metadata_pattern='foo bar', count=count)
         assert list(results) == [
             {'url': 'http://origin2'},
-            {'url': 'http://origin3'},
-            {'url': 'http://origin1'}]
+            {'url': 'http://origin3'}]

         results = stream_results(
             self.search.origin_search,
             metadata_pattern='foo', count=count)
         assert list(results) == [
             {'url': 'http://origin1'},
             {'url': 'http://origin2'},
             {'url': 'http://origin3'}]
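Taken together, the two `'operator': 'and'` additions switch multi-term queries from "any term may match" to "all terms must match". A minimal sketch of the expected behavior, mirroring the new tests above; the instantiation and the 'localhost:9200' host are assumptions for illustration:

    from swh.search.elasticsearch import ElasticSearch

    search = ElasticSearch(['localhost:9200'])  # assumed local ES node
    search.initialize()
    search.origin_update([
        {'url': 'http://foo.bar/baz'},
        {'url': 'http://foo.bar/foo.bar'},
    ])
    search.flush()

    # With Elasticsearch's default OR operator both origins would match
    # 'foo bar baz'; with 'operator': 'and' only the URL containing all
    # three terms is returned.
    assert search.origin_search(url_pattern='foo bar baz')['results'] == [
        {'url': 'http://foo.bar/baz'},
    ]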