diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py index cf34c72..9706006 100644 --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -1,145 +1,191 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 from typing import Iterable, Dict, List, Iterator from elasticsearch import Elasticsearch from elasticsearch.client import IndicesClient from elasticsearch.helpers import bulk, scan import msgpack from swh.model import model from swh.model.identifiers import origin_identifier +def _sanitize_origin(origin): + origin = origin.copy() + res = { + 'url': origin.pop('url') + } + for field_name in ('type', 'intrinsic_metadata'): + if field_name in origin: + res[field_name] = origin.pop(field_name) + return res + + class ElasticSearch: def __init__(self, hosts: List[str]): self._backend = Elasticsearch(hosts=hosts) def check(self): self._backend.ping() + def deinitialize(self) -> None: + self._backend.indices.delete(index='*') + def initialize(self) -> None: self._backend.indices.create( index='origin', body={ 'mappings': { 'properties': { 'url': { 'type': 'text', # TODO: consider removing fielddata when # swh-storage allows querying by hash, so the # full URL does not have to be stored in ES' # memory. 
See: # https://www.elastic.co/guide/en/elasticsearch/reference/current/fielddata.html#before-enabling-fielddata 'fielddata': True, 'analyzer': 'simple', 'fields': { 'as_you_type': { 'type': 'search_as_you_type', 'analyzer': 'simple', } } - } + }, + 'intrinsic_metadata': { + 'type': 'nested', + 'properties': { + '@context': { + # don't bother indexing substrings + 'type': 'keyword', + } + }, + }, } } } ) - def origin_add(self, origins: Iterable[model.Origin]) -> None: - origins = (origin.to_dict() for origin in origins) - ''' - for origin in origins: - self._backend.index( - index='origin', - id=origin_identifier(origin), - body=origin, - ) - self._backend.indices.refresh(index='origin') - ''' + + def origin_update(self, documents: Iterable[dict]) -> None: + documents = map(_sanitize_origin, documents) actions = [ { - '_id': origin_identifier(origin), + '_op_type': 'update', + '_id': origin_identifier(document), '_index': 'origin', - '_source': origin, + 'doc': document, + 'doc_as_upsert': True, } - for origin in origins + for document in documents ] res = bulk(self._backend, actions, index='origin', refresh='wait_for') def origin_dump(self) -> Iterator[model.Origin]: results = list(scan(self._backend, index='*')) for hit in results: yield self._backend.termvectors( index='origin', id=hit['_id'], - fields=['url', 'url.as_you_type', 'url.as_you_type._2gram' - 'url.as_you_type._3gram', 'url._2gram', 'url._3gram']) + fields=['*']) def origin_search( - self, url_substring: str, cursor: str = None, count: int = 50 + self, *, + url_substring: str = None, metadata_substring: str = None, + cursor: str = None, count: int = 50 ) -> Dict[str, object]: """Searches for origins matching the `url_substring`. Args: url_substring (str): Part of the URL to search for cursor (str): `cursor` is an opaque value used for pagination. count (int): number of results to return. Returns: a dictionary with keys: * `cursor`: opaque value used for fetching more results.
`None` if there are no more result. * `results`: list of dictionaries with key: * `url`: URL of a matching origin """ - body = { - 'query': { + query_clauses = [] + + if url_substring: + query_clauses.append({ 'multi_match': { 'query': url_substring, 'type': 'bool_prefix', 'fields': [ 'url.as_you_type', 'url.as_you_type._2gram', 'url.as_you_type._3gram', ] } + }) + + if metadata_substring: + query_clauses.append({ + 'nested': { + 'path': 'intrinsic_metadata', + 'query': { + 'multi_match': { + 'query': metadata_substring, + 'fields': ['intrinsic_metadata.*'] + } + }, + } + }) + + if not query_clauses: + raise ValueError( + 'At least one of url_substring and metadata_substring ' + 'must be provided.') + + body = { + 'query': { + 'bool': { + 'should': query_clauses, + } }, 'size': count, 'sort': [ {'_score': 'desc'}, - {'url': 'asc'}, + {'_id': 'asc'}, ] } if cursor: - cursor = msgpack.decode(base64.b64decode(cursor)) - body['search_after'] = [cursor['_score'], cursor['url']] + cursor = msgpack.loads(base64.b64decode(cursor)) + body['search_after'] = [cursor[b'score'], cursor[b'id']] res = self._backend.search( index='origin', body=body, size=count, ) hits = res['hits']['hits'] if len(hits) == count: last_hit = hits[-1] next_cursor = { - 'score': last_hit['_score'], - 'url': last_hit['_source']['url'], + b'score': last_hit['_score'], + b'id': last_hit['_id'], } next_cursor = base64.b64encode(msgpack.dumps(next_cursor)) else: next_cursor = None return { 'cursor': next_cursor, 'results': [ - {'url': hit['_source']['url'] for hit in hits} + {'url': hit['_source']['url']} + for hit in hits ] } diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py index 610c44b..40fe72e 100644 --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -1,39 +1,171 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any 
later version # See top-level LICENSE file for more information import unittest +from hypothesis import given, strategies, settings import pytest from swh.model.model import Origin + from swh.search.elasticsearch import ElasticSearch +from swh.search.utils import stream_results def test_origin_url_unique_word_prefix(elasticsearch_host): search = ElasticSearch([elasticsearch_host]) search.initialize() - search.origin_add([ - Origin(url='http://foobar.baz', type=None), - Origin(url='http://barbaz.qux', type=None), + search.origin_update([ + {'url': 'http://foobar.baz'}, + {'url': 'http://barbaz.qux'}, + {'url': 'http://qux.quux'}, ]) - results = search.origin_search('foobar') + results = search.origin_search(url_substring='foobar') assert results == {'cursor': None, 'results': [{'url': 'http://foobar.baz'}]} - results = search.origin_search('barb') + results = search.origin_search(url_substring='barb') assert results == {'cursor': None, 'results': [{'url': 'http://barbaz.qux'}]} # 'bar' is part of 'foobar', but is not the beginning of it - results = search.origin_search('bar') + results = search.origin_search(url_substring='bar') assert results == {'cursor': None, 'results': [{'url': 'http://barbaz.qux'}]} - results = search.origin_search('barbaz') + results = search.origin_search(url_substring='barbaz') assert results == {'cursor': None, 'results': [{'url': 'http://barbaz.qux'}]} - results = search.origin_search('qu') - assert results == {'cursor': None, 'results': [{'url': 'http://barbaz.qux'}]} + results = search.origin_search(url_substring='qu') + assert results['cursor'] == None + assert sorted(res['url'] for res in results['results']) \ + == ['http://barbaz.qux', 'http://qux.quux'] - results = search.origin_search('qux') - assert results == {'cursor': None, 'results': [{'url': 'http://barbaz.qux'}]} + results = search.origin_search(url_substring='qux') + assert results['cursor'] == None + assert sorted(res['url'] for res in results['results']) \ + == 
['http://barbaz.qux', 'http://qux.quux'] + + +def test_origin_intrinsic_metadata_description(elasticsearch_host): + search = ElasticSearch([elasticsearch_host]) + search.initialize() + search.origin_update([ + { + 'url': 'http://origin1', + 'intrinsic_metadata': {}, + }, + { + 'url': 'http://origin2', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'description': 'foo bar', + }, + }, + { + 'url': 'http://origin3', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'description': 'bar baz', + } + }, + ]) + + results = search.origin_search(metadata_substring='foo') + assert results == {'cursor': None, 'results': [{'url': 'http://origin2'}]} + + results = search.origin_search(metadata_substring='foo bar') + assert results == {'cursor': None, 'results': [ + {'url': 'http://origin2'}, {'url': 'http://origin3'}]} + + results = search.origin_search(metadata_substring='bar baz') + assert results == {'cursor': None, 'results': [ + {'url': 'http://origin3'}, {'url': 'http://origin2'}]} + + +def test_origin_intrinsic_metadata_keywords(elasticsearch_host): + search = ElasticSearch([elasticsearch_host]) + search.initialize() + search.origin_update([ + { + 'url': 'http://origin1', + 'intrinsic_metadata': {}, + }, + { + 'url': 'http://origin2', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'keywords': ['foo', 'bar'], + }, + }, + { + 'url': 'http://origin3', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'keywords': ['bar', 'baz'], + } + }, + ]) + + results = search.origin_search(metadata_substring='foo') + assert results == {'cursor': None, 'results': [{'url': 'http://origin2'}]} + + results = search.origin_search(metadata_substring='foo bar') + assert results == {'cursor': None, 'results': [ + {'url': 'http://origin2'}, {'url': 'http://origin3'}]} + + results = search.origin_search(metadata_substring='bar baz') + 
assert results == {'cursor': None, 'results': [ + {'url': 'http://origin3'}, {'url': 'http://origin2'}]} + + +@settings(deadline=None) +@given(strategies.integers(min_value=1, max_value=4)) +def test_origin_paging(elasticsearch_host, count): + search = ElasticSearch([elasticsearch_host]) + search.deinitialize() + search.initialize() + search.origin_update([ + { + 'url': 'http://origin1', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'keywords': ['foo'], + }, + }, + { + 'url': 'http://origin2', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'keywords': ['foo', 'bar'], + }, + }, + { + 'url': 'http://origin3', + 'intrinsic_metadata': { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'keywords': ['foo', 'bar', 'baz'], + } + }, + ]) + + results = stream_results( + search.origin_search, metadata_substring='foo bar baz', count=count) + assert list(results) == [ + {'url': 'http://origin3'}, + {'url': 'http://origin2'}, + {'url': 'http://origin1'}] + + results = stream_results( + search.origin_search, metadata_substring='foo bar', count=count) + assert list(results) == [ + {'url': 'http://origin2'}, + {'url': 'http://origin3'}, + {'url': 'http://origin1'}] + + results = stream_results( + search.origin_search, metadata_substring='foo', count=count) + assert list(results) == [ + {'url': 'http://origin1'}, + {'url': 'http://origin2'}, + {'url': 'http://origin3'}]