diff --git a/swh/search/__init__.py b/swh/search/__init__.py index f7a63d0..d63010d 100644 --- a/swh/search/__init__.py +++ b/swh/search/__init__.py @@ -1,24 +1,26 @@ def get_search(cls, args): """Get an search object of class `search_class` with arguments `search_args`. Args: cls (str): search's class, either 'local' or 'remote' args (dict): dictionary of arguments passed to the search class constructor Returns: an instance of swh.search's classes (either local or remote) Raises: ValueError if passed an unknown search class. """ if cls == 'remote': from .api.client import RemoteSearch as Search elif cls == 'elasticsearch': from .elasticsearch import ElasticSearch as Search + elif cls == 'memory': + from .in_memory import InMemorySearch as Search else: raise ValueError('Unknown indexer search class `%s`' % cls) return Search(**args) diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py new file mode 100644 index 0000000..4cc2c00 --- /dev/null +++ b/swh/search/in_memory.py @@ -0,0 +1,117 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import base64 +from collections import defaultdict +import itertools +import re +from typing import Iterable, Dict + +import msgpack + +from swh.core.api import remote_api_endpoint +from swh.model.identifiers import origin_identifier + + +def _sanitize_origin(origin): + origin = origin.copy() + res = { + 'url': origin.pop('url') + } + for field_name in ('type', 'intrinsic_metadata'): + if field_name in origin: + res[field_name] = origin.pop(field_name) + return res + + +class InMemorySearch: + def __init__(self): + pass + + @remote_api_endpoint('check') + def check(self): + return True + + def deinitialize(self) -> None: + if hasattr(self, '_origins'): + del self._origins + del self._origin_ids + + def initialize(self) 
-> None: + self._origins = defaultdict(dict) + self._origin_ids = [] + + _url_splitter = re.compile(r'\W') + + @remote_api_endpoint('origin/update') + def origin_update(self, documents: Iterable[dict]) -> None: + for document in documents: + document = document.copy() + id_ = origin_identifier(document) + if 'url' in document: + document['_url_tokens'] = \ + set(self._url_splitter.split(document['url'])) + self._origins[id_].update(document) + if id_ not in self._origin_ids: + self._origin_ids.append(id_) + + @remote_api_endpoint('origin/search') + def origin_search( + self, *, + url_substring: str = None, metadata_substring: str = None, + cursor: str = None, count: int = 50 + ) -> Dict[str, object]: + matches = (self._origins[id_] for id_ in self._origin_ids) + + if url_substring: + tokens = set(self._url_splitter.split(url_substring)) + + def predicate(match): + missing_tokens = tokens - match['_url_tokens'] + if len(missing_tokens) == 0: + return True + elif len(missing_tokens) > 1: + return False + else: + # There is one missing token, look up by prefix. 
+ (missing_token,) = missing_tokens + return any(token.startswith(missing_token) + for token in match['_url_tokens']) + + matches = filter(predicate, matches) + + if metadata_substring: + raise NotImplementedError( + 'Metadata search is not implemented in the in-memory backend.') + + if not url_substring and not metadata_substring: + raise ValueError( + 'At least one of url_substring and metadata_substring ' + 'must be provided.') + + if cursor: + cursor = msgpack.loads(base64.b64decode(cursor)) + start_at_index = cursor[b'start_at_index'] + else: + start_at_index = 0 + + hits = list(itertools.islice( + matches, start_at_index, start_at_index+count)) + + if len(hits) == count: + next_cursor = { + b'start_at_index': start_at_index+count, + } + next_cursor = base64.b64encode(msgpack.dumps(next_cursor)) + else: + next_cursor = None + + return { + 'cursor': next_cursor, + 'results': [ + {'url': hit['url']} + for hit in hits + ] + } diff --git a/swh/search/tests/test_elasticsearch.py b/swh/search/tests/test_elasticsearch.py index fb5dc8a..90a4bc4 100644 --- a/swh/search/tests/test_elasticsearch.py +++ b/swh/search/tests/test_elasticsearch.py @@ -1,29 +1,32 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import pytest from swh.search.elasticsearch import ElasticSearch from .test_search import CommonSearchTest class BaseElasticsearchTest(unittest.TestCase): @pytest.fixture(autouse=True) def _instantiate_search(self, elasticsearch_host): self._elasticsearch_host = elasticsearch_host self.search = ElasticSearch([elasticsearch_host]) def setUp(self): self.reset() def reset(self): self.search.deinitialize() self.search.initialize() class TestElasticsearchSearch(CommonSearchTest, BaseElasticsearchTest): - pass + @pytest.mark.skip('Elasticsearch also returns close 
matches, ' + 'so this test would fail') + def test_origin_url_paging(self, count): + pass diff --git a/swh/search/tests/test_in_memory.py b/swh/search/tests/test_in_memory.py new file mode 100644 index 0000000..3f2491a --- /dev/null +++ b/swh/search/tests/test_in_memory.py @@ -0,0 +1,36 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +import pytest + +from swh.search.in_memory import InMemorySearch +from .test_search import CommonSearchTest + + +class InmemorySearchTest(unittest.TestCase, CommonSearchTest): + @pytest.fixture(autouse=True) + def _instantiate_search(self): + self.search = InMemorySearch() + + def setUp(self): + self.reset() + + def reset(self): + self.search.deinitialize() + self.search.initialize() + + @pytest.mark.skip('Not implemented in the in-memory search') + def test_origin_intrinsic_metadata_description(self): + pass + + @pytest.mark.skip('Not implemented in the in-memory search') + def test_origin_intrinsic_metadata_keywords(self): + pass + + @pytest.mark.skip('Not implemented in the in-memory search') + def test_origin_intrinsic_metadata_paging(self): + pass diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py index 64bc16a..443d5af 100644 --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1,166 +1,200 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given, strategies, settings from swh.search.utils import stream_results class CommonSearchTest: def test_origin_url_unique_word_prefix(self): self.search.origin_update([ {'url': 'http://foobar.baz'}, 
{'url': 'http://barbaz.qux'}, {'url': 'http://qux.quux'}, ]) results = self.search.origin_search(url_substring='foobar') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://foobar.baz'}]}) results = self.search.origin_search(url_substring='barb') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://barbaz.qux'}]}) # 'bar' is part of 'foobar', but is not the beginning of it results = self.search.origin_search(url_substring='bar') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://barbaz.qux'}]}) results = self.search.origin_search(url_substring='barbaz') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://barbaz.qux'}]}) results = self.search.origin_search(url_substring='qu') self.assertIsNone(results['cursor']) self.assertEqual( sorted(res['url'] for res in results['results']), ['http://barbaz.qux', 'http://qux.quux']) results = self.search.origin_search(url_substring='qux') self.assertIsNone(results['cursor']) self.assertEqual( sorted(res['url'] for res in results['results']), ['http://barbaz.qux', 'http://qux.quux']) def test_origin_intrinsic_metadata_description(self): self.search.origin_update([ { 'url': 'http://origin1', 'intrinsic_metadata': {}, }, { 'url': 'http://origin2', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'description': 'foo bar', }, }, { 'url': 'http://origin3', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'description': 'bar baz', } }, ]) results = self.search.origin_search(metadata_substring='foo') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://origin2'}]}) results = self.search.origin_search(metadata_substring='foo bar') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://origin2'}, {'url': 'http://origin3'}]}) results = self.search.origin_search(metadata_substring='bar baz') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 
'http://origin3'}, {'url': 'http://origin2'}]}) def test_origin_intrinsic_metadata_keywords(self): self.search.origin_update([ { 'url': 'http://origin1', 'intrinsic_metadata': {}, }, { 'url': 'http://origin2', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'keywords': ['foo', 'bar'], }, }, { 'url': 'http://origin3', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'keywords': ['bar', 'baz'], } }, ]) results = self.search.origin_search(metadata_substring='foo') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://origin2'}]}) results = self.search.origin_search(metadata_substring='foo bar') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://origin2'}, {'url': 'http://origin3'}]}) results = self.search.origin_search(metadata_substring='bar baz') self.assertEqual(results, {'cursor': None, 'results': [ {'url': 'http://origin3'}, {'url': 'http://origin2'}]}) @settings(deadline=None) @given(strategies.integers(min_value=1, max_value=4)) - def test_origin_paging(self, count): + def test_origin_url_paging(self, count): + self.reset() + self.search.origin_update([ + {'url': 'http://origin1/foo'}, + {'url': 'http://origin2/foo/bar'}, + {'url': 'http://origin3/foo/bar/baz'}, + ]) + + results = list(stream_results( + self.search.origin_search, + url_substring='foo bar baz', count=count)) + expected_results = [ + {'url': 'http://origin3/foo/bar/baz'}] + self.assertEqual(results, expected_results) + + results = list(stream_results( + self.search.origin_search, + url_substring='foo bar', count=count)) + expected_results = [ + {'url': 'http://origin2/foo/bar'}, + {'url': 'http://origin3/foo/bar/baz'}] + self.assertEqual(results, expected_results) + + results = list(stream_results( + self.search.origin_search, + url_substring='foo', count=count)) + expected_results = [ + {'url': 'http://origin1/foo'}, + {'url': 'http://origin2/foo/bar'}, + {'url': 
'http://origin3/foo/bar/baz'}] + self.assertEqual(results, expected_results) + + @settings(deadline=None) + @given(strategies.integers(min_value=1, max_value=4)) + def test_origin_intrinsic_metadata_paging(self, count): self.reset() self.search.origin_update([ { 'url': 'http://origin1', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'keywords': ['foo'], }, }, { 'url': 'http://origin2', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'keywords': ['foo', 'bar'], }, }, { 'url': 'http://origin3', 'intrinsic_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'keywords': ['foo', 'bar', 'baz'], } }, ]) results = stream_results( self.search.origin_search, metadata_substring='foo bar baz', count=count) self.assertEqual(list(results), [ {'url': 'http://origin3'}, {'url': 'http://origin2'}, {'url': 'http://origin1'}]) results = stream_results( self.search.origin_search, metadata_substring='foo bar', count=count) self.assertEqual(list(results), [ {'url': 'http://origin2'}, {'url': 'http://origin3'}, {'url': 'http://origin1'}]) results = stream_results( self.search.origin_search, metadata_substring='foo', count=count) self.assertEqual(list(results), [ {'url': 'http://origin1'}, {'url': 'http://origin2'}, {'url': 'http://origin3'}])