diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -4,3 +4,4 @@
 swh.indexer >= 0.0.120
 swh.scheduler >= 0.0.31
 swh.core >= 0.0.81
+swh.search >= 0.0.3
diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py
--- a/swh/web/api/views/origin.py
+++ b/swh/web/api/views/origin.py
@@ -194,7 +194,7 @@
         and only the Link header should be used for paginating through
         results.

-    :param string url_pattern: a string pattern or a regular expression
+    :param string url_pattern: a string pattern
     :query int limit: the maximum number of found origins to return
        (bounded to 1000)
    :query boolean regexp: if true, consider provided pattern as a regular
@@ -219,21 +219,19 @@
        :swh_web_api:`origin/search/python/?limit=2`
    """
    result = {}
-    offset = int(request.query_params.get('offset', '0'))
     limit = min(int(request.query_params.get('limit', '70')), 1000)
-    regexp = request.query_params.get('regexp', 'false')
+    page_token = request.query_params.get('page_token')
     with_visit = request.query_params.get('with_visit', 'false')

-    results = api_lookup(service.search_origin, url_pattern, offset, limit,
-                         bool(strtobool(regexp)), bool(strtobool(with_visit)),
-                         enrich_fn=_enrich_origin)
+    (results, page_token) = api_lookup(
+        service.search_origin, url_pattern, limit,
+        page_token, bool(strtobool(with_visit)),
+        enrich_fn=_enrich_origin)

-    nb_results = len(results)
-    if nb_results == limit:
+    if page_token is not None:
         query_params = {}
-        query_params['offset'] = offset + limit
         query_params['limit'] = limit
-        query_params['regexp'] = regexp
+        query_params['page_token'] = page_token

         result['headers'] = {
             'link-next': reverse('api-1-origin-search',
@@ -242,7 +240,7 @@
         }

     result.update({
-        'results': results
+        'results': list(results)
     })

     return result
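After this change, clients page with the opaque page_token instead of an offset, following the Link header the view builds from 'link-next'. A minimal client-side sketch, assuming a reachable swh-web instance at base_url and the requests library; the helper name and URLs are illustrative, not part of this diff:

import requests


def iter_origin_search(base_url, url_pattern, limit=70):
    """Yield origins from /api/1/origin/search/, following pagination.

    Relies on the next page being advertised through the HTTP Link
    header, as the endpoint docstring above prescribes.
    """
    url = '%s/api/1/origin/search/%s/?limit=%d' % (base_url, url_pattern, limit)
    while url:
        response = requests.get(url)
        response.raise_for_status()
        yield from response.json()
        # requests parses the Link header into response.links;
        # stop when no rel="next" link is provided.
        url = response.links.get('next', {}).get('url')
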
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -3,9 +3,7 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import itertools
 import os
-import re

 from collections import defaultdict
 from typing import Any, Dict
@@ -23,6 +21,7 @@
 from swh.web.common.origin_visits import get_origin_visit
 from swh.web import config

+search = config.search()
 storage = config.storage()
 vault = config.vault()
 idx_storage = config.indexer_storage()
@@ -247,8 +246,7 @@
     return map(converters.from_origin, origins)


-def search_origin(url_pattern, offset=0, limit=50, regexp=False,
-                  with_visit=False):
+def search_origin(url_pattern, limit=50, page_token=None, with_visit=False):
     """Search for origins whose urls contain a provided string pattern
     or match a provided regular expression.
@@ -261,21 +259,11 @@
         list of origin information as dict.

     """
-    if not regexp:
-        # If the query is not a regexp, rewrite it as a regexp.
-        regexp = True
-        search_words = [re.escape(word) for word in url_pattern.split()]
-        if len(search_words) >= 7:
-            url_pattern = '.*'.join(search_words)
-        else:
-            pattern_parts = []
-            for permut in itertools.permutations(search_words):
-                pattern_parts.append('.*'.join(permut))
-            url_pattern = '|'.join(pattern_parts)
-
-    origins = storage.origin_search(url_pattern, offset, limit, regexp,
-                                    with_visit)
-    return map(converters.from_origin, origins)
+    results = search.origin_search(url_pattern=url_pattern, count=limit,
+                                   page_token=page_token,
+                                   with_visit=with_visit)
+    origins = map(converters.from_origin, results['results'])
+    return (origins, results['next_page_token'])


def search_origin_metadata(fulltext, limit=50):
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -10,6 +10,7 @@
 from swh.core import config
 from swh.indexer.storage import get_indexer_storage
 from swh.scheduler import get_scheduler
+from swh.search import get_search
 from swh.storage import get_storage
 from swh.vault import get_vault
 from swh.web import settings
@@ -30,6 +31,13 @@
             'timeout': 1,
         }
     }),
+    'search': ('dict', {
+        'cls': 'remote',
+        'args': {
+            'url': 'http://127.0.0.1:5010/',
+            'timeout': 10,
+        },
+    }),
     'log_dir': ('string', '/tmp/swh/log'),
     'debug': ('bool', False),
     'serve_assets': ('bool', False),
@@ -127,6 +135,7 @@
         cfg = config.load_named_config(config_file, DEFAULT_CONFIG)
         swhweb_config.update(cfg)
         config.prepare_folders(swhweb_config, 'log_dir')
+        swhweb_config['search'] = get_search(**swhweb_config['search'])
         swhweb_config['storage'] = get_storage(**swhweb_config['storage'])
         swhweb_config['vault'] = get_vault(**swhweb_config['vault'])
         swhweb_config['indexer_storage'] = \
@@ -136,6 +145,13 @@
     return swhweb_config


+def search():
+    """Return the current application's search.
+
+    """
+    return get_config()['search']
+
+
 def storage():
     """Return the current application's storage.
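On the service side, search_origin now returns an (origins, next_page_token) pair instead of a plain iterable. A sketch of how a caller could drain every page, assuming a search backend has been configured through swh.web.config as above; the helper name is invented for illustration:

from swh.web.common import service


def collect_matching_origins(url_pattern, page_size=50):
    """Follow next_page_token until the search backend is exhausted."""
    page_token = None
    origins = []
    while True:
        page, page_token = service.search_origin(
            url_pattern, limit=page_size, page_token=page_token)
        origins.extend(page)
        if page_token is None:
            break
    return origins
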
diff --git a/swh/web/settings/tests.py b/swh/web/settings/tests.py
--- a/swh/web/settings/tests.py
+++ b/swh/web/settings/tests.py
@@ -101,7 +101,8 @@
     })
     from swh.web.tests.data import get_tests_data, override_storages  # noqa
     test_data = get_tests_data()
-    override_storages(test_data['storage'], test_data['idx_storage'])
+    override_storages(test_data['storage'], test_data['idx_storage'],
+                      test_data['search'])
 else:
     ALLOWED_HOSTS += ['testserver']
diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py
--- a/swh/web/tests/api/views/test_origin.py
+++ b/swh/web/tests/api/views/test_origin.py
@@ -497,22 +497,6 @@
         == {'https://github.com/memononen/libtess2'}


-def test_api_origin_search_regexp(api_client):
-    expected_origins = {
-        'https://github.com/memononen/libtess2',
-        'repo_with_submodules'
-    }
-
-    url = reverse('api-1-origin-search',
-                  url_args={'url_pattern': '(repo|libtess)'},
-                  query_params={'limit': 10,
-                                'regexp': True})
-    rv = api_client.get(url)
-    assert rv.status_code == 200, rv.data
-    assert rv['Content-Type'] == 'application/json'
-    assert {origin['url'] for origin in rv.data} == expected_origins
-
-
 @pytest.mark.parametrize('limit', [1, 2, 3, 10])
 def test_api_origin_search_scroll(api_client, archive_data, limit):
     expected_origins = {
@@ -529,8 +513,8 @@
     assert {origin['url'] for origin in results} == expected_origins


-def test_api_origin_search_limit(api_client, archive_data):
-    archive_data.origin_add([
+def test_api_origin_search_limit(api_client, archive_data, tests_data):
+    tests_data['search'].origin_update([
         {'url': 'http://foobar/{}'.format(i)}
         for i in range(2000)
     ])
diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py
--- a/swh/web/tests/conftest.py
+++ b/swh/web/tests/conftest.py
@@ -95,7 +95,8 @@
     data = get_tests_data(reset=True)
     # Update swh-web configuration to use the in-memory storages
     # instantiated in the tests.data module
-    override_storages(data['storage'], data['idx_storage'])
+    override_storages(data['storage'], data['idx_storage'],
+                      data['search'])
     return data
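Tests that need extra searchable origins now seed the in-memory search instance exposed through the tests_data fixture, as test_api_origin_search_limit does above. A hedged sketch of that pattern for a new test; the test name, sample URLs, and the assumption that the seeded URLs match the pattern are illustrative only:

from swh.web.common.utils import reverse


def test_origin_search_custom_seed(api_client, tests_data):
    # Seed the in-memory search backend directly, like
    # test_api_origin_search_limit above
    tests_data['search'].origin_update([
        {'url': 'http://example.test/%d' % i} for i in range(30)
    ])

    url = reverse('api-1-origin-search',
                  url_args={'url_pattern': 'example.test'},
                  query_params={'limit': 10})
    rv = api_client.get(url)

    assert rv.status_code == 200, rv.data
    assert len(rv.data) <= 10
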
diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py
--- a/swh/web/tests/data.py
+++ b/swh/web/tests/data.py
@@ -20,6 +20,7 @@
 from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS
 from swh.model.identifiers import directory_identifier
 from swh.loader.git.from_disk import GitLoaderFromArchive
+from swh.search import get_search
 from swh.storage.algos.dir_iterators import dir_iterator
 from swh.web import config
 from swh.web.browse.utils import (
@@ -171,6 +172,11 @@
     # To hold reference to the memory storage
     storage = None

+    # Create search instance
+    search = get_search('memory', {})
+    search.initialize()
+    search.origin_update({'url': origin['url']} for origin in _TEST_ORIGINS)
+
     # Load git repositories from archives
     for origin in _TEST_ORIGINS:
         for i, archive in enumerate(origin['archives']):
@@ -188,10 +194,12 @@
             loader.load()

         origin.update(storage.origin_get(origin))  # add an 'id' key if enabled
+        search.origin_update([{'url': origin['url'], 'has_visits': True}])

     for i in range(250):
         url = 'https://many.origins/%d' % (i+1)
         storage.origin_add([{'url': url}])
+        search.origin_update([{'url': url, 'has_visits': True}])
         visit = storage.origin_visit_add(url, '2019-12-03 13:55:05', 'tar')
         storage.origin_visit_update(
             url, visit['visit'],
@@ -264,6 +272,7 @@
     return {
         'storage': storage,
         'idx_storage': idx_storage,
+        'search': search,
         'origins': _TEST_ORIGINS,
         'contents': contents,
         'directories': list(directories),
@@ -322,17 +331,21 @@
     return _current_tests_data


-def override_storages(storage, idx_storage):
+def override_storages(storage, idx_storage, search):
     """
     Helper function to replace the storages from which archive data
     are fetched.
     """
     swh_config = config.get_config()
-    swh_config.update({'storage': storage})
-    service.storage = storage
+    swh_config.update({
+        'storage': storage,
+        'indexer_storage': idx_storage,
+        'search': search,
+    })

-    swh_config.update({'indexer_storage': idx_storage})
+    service.storage = storage
     service.idx_storage = idx_storage
+    service.search = search


 # Implement some special endpoints used to provide input tests data
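For reference, a standalone sketch of the in-memory backend lifecycle that data.py relies on above. The calls mirror the ones in this diff; the sample origin URL is invented, and the result-shape comments reflect how service.search_origin() consumes origin_search():

from swh.search import get_search

# In-memory swh-search backend, as instantiated in get_tests_data() above
search = get_search('memory', {})
search.initialize()

# Index an origin; has_visits makes it eligible for with_visit searches
search.origin_update([
    {'url': 'https://example.com/project', 'has_visits': True},
])

# origin_search() returns one page: a dict with 'results' and
# 'next_page_token', which is exactly what search_origin() unpacks
page = search.origin_search(url_pattern='example', count=10)
urls = [result['url'] for result in page['results']]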