diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -4,3 +4,4 @@
 swh.indexer >= 0.0.120
 swh.scheduler >= 0.0.31
 swh.core >= 0.0.81
+swh.search >= 0.0.3
diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py
--- a/swh/web/api/views/origin.py
+++ b/swh/web/api/views/origin.py
@@ -238,7 +238,7 @@
         }
 
     result.update({
-        'results': results
+        'results': list(results)
     })
 
     return result
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -3,9 +3,7 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import itertools
 import os
-import re
 
 from collections import defaultdict
 from typing import Any, Dict
@@ -23,6 +21,7 @@
 from swh.web.common.origin_visits import get_origin_visit
 from swh.web import config
 
+search = config.search()
 storage = config.storage()
 vault = config.vault()
 idx_storage = config.indexer_storage()
@@ -260,25 +259,13 @@
         list of origin information as dict.
 
     """
-    offset = int(page_token) if page_token else 0
-    regexp = True
-    search_words = [re.escape(word) for word in url_pattern.split()]
-    if len(search_words) >= 7:
-        url_pattern = '.*'.join(search_words)
-    else:
-        pattern_parts = []
-        for permut in itertools.permutations(search_words):
-            pattern_parts.append('.*'.join(permut))
-        url_pattern = '|'.join(pattern_parts)
-
-    origins = storage.origin_search(url_pattern, offset, limit, regexp,
-                                    with_visit)
-    origins = list(map(converters.from_origin, origins))
-    if len(origins) >= limit:
-        page_token = str(offset + len(origins))
-    else:
-        page_token = None
-    return (origins, page_token)
+    results = search.origin_search(url_pattern=url_pattern, count=limit,
+                                   page_token=page_token,
+                                   with_visit=with_visit)
+    # Materialize the results: this function is documented to return a
+    # list, and a lazy ``map`` object can only be consumed once.
+    origins = list(map(converters.from_origin, results['results']))
+    return (origins, results['next_page_token'])
 
 
 def search_origin_metadata(fulltext, limit=50):
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -10,6 +10,7 @@
 from swh.core import config
 from swh.indexer.storage import get_indexer_storage
 from swh.scheduler import get_scheduler
+from swh.search import get_search
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.web import settings
@@ -30,6 +31,13 @@
             'timeout': 1,
         }
     }),
+    'search': ('dict', {
+        'cls': 'remote',
+        'args': {
+            'url': 'http://127.0.0.1:5010/',
+            'timeout': 10,
+        },
+    }),
     'log_dir': ('string', '/tmp/swh/log'),
     'debug': ('bool', False),
     'serve_assets': ('bool', False),
@@ -128,6 +136,7 @@
         cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
         swhweb_config.update(cfg)
         config.prepare_folders(swhweb_config, 'log_dir')
+        swhweb_config['search'] = get_search(**swhweb_config['search'])
         swhweb_config['storage'] = get_storage(**swhweb_config['storage'])
         swhweb_config['vault'] = get_vault(**swhweb_config['vault'])
         swhweb_config['indexer_storage'] = \
@@ -137,6 +146,13 @@
     return swhweb_config
 
 
+def search():
+    """Return the current application's search.
+
+    """
+    return get_config()['search']
+
+
 def storage():
     """Return the current application's storage.
 
diff --git a/swh/web/settings/tests.py b/swh/web/settings/tests.py
--- a/swh/web/settings/tests.py
+++ b/swh/web/settings/tests.py
@@ -101,7 +101,8 @@
     })
     from swh.web.tests.data import get_tests_data, override_storages  # noqa
     test_data = get_tests_data()
-    override_storages(test_data['storage'], test_data['idx_storage'])
+    override_storages(test_data['storage'], test_data['idx_storage'],
+                      test_data['search'])
 else:
     ALLOWED_HOSTS += ['testserver']
 
diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py
--- a/swh/web/tests/api/views/test_origin.py
+++ b/swh/web/tests/api/views/test_origin.py
@@ -513,8 +513,8 @@
     assert {origin['url'] for origin in results} == expected_origins
 
 
-def test_api_origin_search_limit(api_client, archive_data):
-    archive_data.origin_add([
+def test_api_origin_search_limit(api_client, archive_data, tests_data):
+    tests_data['search'].origin_update([
         {'url': 'http://foobar/{}'.format(i)}
         for i in range(2000)
     ])
diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py
--- a/swh/web/tests/conftest.py
+++ b/swh/web/tests/conftest.py
@@ -101,7 +101,8 @@
     data = get_tests_data(reset=True)
     # Update swh-web configuration to use the in-memory storages
     # instantiated in the tests.data module
-    override_storages(data['storage'], data['idx_storage'])
+    override_storages(data['storage'], data['idx_storage'],
+                      data['search'])
     return data
 
 
diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py
--- a/swh/web/tests/data.py
+++ b/swh/web/tests/data.py
@@ -20,6 +20,7 @@
 from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS
 from swh.model.identifiers import directory_identifier
 from swh.loader.git.from_disk import GitLoaderFromArchive
+from swh.search import get_search
 from swh.storage.algos.dir_iterators import dir_iterator
 from swh.web import config
 from swh.web.browse.utils import (
@@ -159,6 +160,11 @@
     # To hold reference to the memory storage
     storage = None
 
+    # Create search instance
+    search = get_search('memory', {})
+    search.initialize()
+    search.origin_update({'url': origin['url']} for origin in _TEST_ORIGINS)
+
     # Load git repositories from archives
     for origin in _TEST_ORIGINS:
         for i, archive in enumerate(origin['archives']):
@@ -176,10 +182,12 @@
             loader.load()
 
         origin.update(storage.origin_get(origin))  # add an 'id' key if enabled
+        search.origin_update([{'url': origin['url'], 'has_visits': True}])
 
     for i in range(250):
         url = 'https://many.origins/%d' % (i+1)
         storage.origin_add([{'url': url}])
+        search.origin_update([{'url': url, 'has_visits': True}])
         visit = storage.origin_visit_add(url, '2019-12-03 13:55:05', 'tar')
         storage.origin_visit_update(
             url, visit['visit'],
@@ -252,6 +260,7 @@
     return {
         'storage': storage,
         'idx_storage': idx_storage,
+        'search': search,
         'origins': _TEST_ORIGINS,
         'contents': contents,
         'directories': list(directories),
@@ -310,17 +319,21 @@
     return _current_tests_data
 
 
-def override_storages(storage, idx_storage):
+def override_storages(storage, idx_storage, search):
     """
     Helper function to replace the storages from which archive data
     are fetched.
     """
     swh_config = config.get_config()
-    swh_config.update({'storage': storage})
-    service.storage = storage
+    swh_config.update({
+        'storage': storage,
+        'indexer_storage': idx_storage,
+        'search': search,
+    })
 
-    swh_config.update({'indexer_storage': idx_storage})
+    service.storage = storage
     service.idx_storage = idx_storage
+    service.search = search
 
 
 # Implement some special endpoints used to provide input tests data