diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -3,3 +3,4 @@ swh.vault >= 0.0.23 swh.indexer >= 0.0.120 swh.scheduler >= 0.0.31 +swh.search >= 0.0.1 diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py --- a/swh/web/api/views/origin.py +++ b/swh/web/api/views/origin.py @@ -193,7 +193,7 @@ and only the Link header should be used for paginating through results. - :param string url_pattern: a string pattern or a regular expression + :param string url_pattern: a string pattern :query int limit: the maximum number of found origins to return (bounded to 1000) :query boolean regexp: if true, consider provided pattern as a regular @@ -218,21 +218,19 @@ :swh_web_api:`origin/search/python/?limit=2` """ result = {} - offset = int(request.query_params.get('offset', '0')) limit = min(int(request.query_params.get('limit', '70')), 1000) - regexp = request.query_params.get('regexp', 'false') + scroll_token = request.query_params.get('scroll_token') with_visit = request.query_params.get('with_visit', 'false') - results = api_lookup(service.search_origin, url_pattern, offset, limit, - bool(strtobool(regexp)), bool(strtobool(with_visit)), - enrich_fn=_enrich_origin) + (results, scroll_token) = api_lookup( + service.search_origin, url_pattern, limit, + scroll_token, bool(strtobool(with_visit)), + enrich_fn=_enrich_origin) - nb_results = len(results) - if nb_results == limit: + if scroll_token is not None: query_params = {} - query_params['offset'] = offset + limit query_params['limit'] = limit - query_params['regexp'] = regexp + query_params['scroll_token'] = scroll_token result['headers'] = { 'link-next': reverse('api-1-origin-search', @@ -241,7 +239,7 @@ } result.update({ - 'results': results + 'results': list(results) }) return result diff --git a/swh/web/browse/views/origin.py b/swh/web/browse/views/origin.py --- a/swh/web/browse/views/origin.py +++ b/swh/web/browse/views/origin.py @@ -177,17 
+177,15 @@ a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. """ - offset = int(request.GET.get('offset', '0')) + # TODO: handle scroll_token to paginate search results limit = min(int(request.GET.get('limit', '50')), 1000) - regexp = request.GET.get('regexp', 'false') with_visit = request.GET.get('with_visit', 'false') url_pattern = url_pattern.replace('///', '\\') try: - results = service.search_origin(url_pattern, offset, limit, - bool(strtobool(regexp)), - bool(strtobool(with_visit))) + (results, scroll_token) = service.search_origin( + url_pattern, limit, with_visit=bool(strtobool(with_visit))) results = json.dumps(list(results), sort_keys=True, indent=4, separators=(',', ': ')) diff --git a/swh/web/common/service.py b/swh/web/common/service.py --- a/swh/web/common/service.py +++ b/swh/web/common/service.py @@ -21,6 +21,7 @@ from swh.web.common.origin_visits import get_origin_visit from swh.web import config +search = config.search() storage = config.storage() vault = config.vault() idx_storage = config.indexer_storage() @@ -245,8 +246,7 @@ return map(converters.from_origin, origins) -def search_origin(url_pattern, offset=0, limit=50, regexp=False, - with_visit=False): +def search_origin(url_pattern, limit=50, scroll_token=None, with_visit=False): """Search for origins whose urls contain a provided string pattern or match a provided regular expression. @@ -259,9 +259,11 @@ list of origin information as dict. 
""" - origins = storage.origin_search(url_pattern, offset, limit, regexp, - with_visit) - return map(converters.from_origin, origins) + results = search.origin_search(url_pattern=url_pattern, count=limit, + scroll_token=scroll_token, + with_visit=with_visit) + origins = map(converters.from_origin, results['results']) + return (origins, results['scroll_token']) def search_origin_metadata(fulltext, limit=50): diff --git a/swh/web/config.py b/swh/web/config.py --- a/swh/web/config.py +++ b/swh/web/config.py @@ -10,6 +10,7 @@ from swh.core import config from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler +from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings @@ -32,6 +33,13 @@ 'timeout': 1, } }), + 'search': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://127.0.0.1:5010/', + 'timeout': 10, + }, + }), 'log_dir': ('string', '/tmp/swh/log'), 'debug': ('bool', False), 'serve_assets': ('bool', False), @@ -123,6 +131,7 @@ cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, 'log_dir') + swhweb_config['search'] = get_search(**swhweb_config['search']) swhweb_config['storage'] = get_storage(**swhweb_config['storage']) swhweb_config['vault'] = get_vault(**swhweb_config['vault']) swhweb_config['indexer_storage'] = \ @@ -132,6 +141,13 @@ return swhweb_config +def search(): + """Return the current application's search. + + """ + return get_config()['search'] + + def storage(): """Return the current application's storage. 
diff --git a/swh/web/settings/tests.py b/swh/web/settings/tests.py --- a/swh/web/settings/tests.py +++ b/swh/web/settings/tests.py @@ -95,7 +95,8 @@ }) from swh.web.tests.data import get_tests_data, override_storages # noqa test_data = get_tests_data() - override_storages(test_data['storage'], test_data['idx_storage']) + override_storages(test_data['storage'], test_data['idx_storage'], + test_data['search']) else: ALLOWED_HOSTS += ['testserver'] diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py --- a/swh/web/tests/api/views/test_origin.py +++ b/swh/web/tests/api/views/test_origin.py @@ -454,22 +454,6 @@ assert {origin['url'] for origin in rv.data} == expected_origins -def test_api_origin_search_regexp(api_client): - expected_origins = { - 'https://github.com/memononen/libtess2', - 'repo_with_submodules' - } - - url = reverse('api-1-origin-search', - url_args={'url_pattern': '(repo|libtess)'}, - query_params={'limit': 10, - 'regexp': True}) - rv = api_client.get(url) - assert rv.status_code == 200, rv.data - assert rv['Content-Type'] == 'application/json' - assert {origin['url'] for origin in rv.data} == expected_origins - - @pytest.mark.parametrize('limit', [1, 2, 3, 10]) def test_api_origin_search_scroll(api_client, archive_data, limit): expected_origins = { @@ -486,8 +470,8 @@ assert {origin['url'] for origin in results} == expected_origins -def test_api_origin_search_limit(api_client, archive_data): - archive_data.origin_add([ +def test_api_origin_search_limit(api_client, archive_data, tests_data): + tests_data['search'].origin_update([ {'url': 'http://foobar/{}'.format(i)} for i in range(2000) ]) diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py --- a/swh/web/tests/conftest.py +++ b/swh/web/tests/conftest.py @@ -95,7 +95,8 @@ data = get_tests_data(reset=True) # Update swh-web configuration to use the in-memory storages # instantiated in the tests.data module - override_storages(data['storage'], 
data['idx_storage']) + override_storages(data['storage'], data['idx_storage'], + data['search']) return data diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py --- a/swh/web/tests/data.py +++ b/swh/web/tests/data.py @@ -20,6 +20,7 @@ from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS from swh.model.identifiers import directory_identifier from swh.loader.git.from_disk import GitLoaderFromArchive +from swh.search import get_search from swh.storage.algos.dir_iterators import dir_iterator from swh.web import config from swh.web.browse.utils import ( @@ -244,6 +245,11 @@ # Create indexer storage instance that will be shared by indexers idx_storage = get_indexer_storage('memory', {}) + # Create search instance + search = get_search('memory', {}) + search.initialize() + search.origin_update({'url': origin['url']}for origin in _TEST_ORIGINS) + # Add the empty directory to the test archive empty_dir_id = directory_identifier({'entries': []}) empty_dir_id_bin = hash_to_bytes(empty_dir_id) @@ -253,6 +259,7 @@ return { 'storage': storage, 'idx_storage': idx_storage, + 'search': search, 'origins': _TEST_ORIGINS, 'contents': contents, 'directories': list(directories), @@ -311,17 +318,21 @@ return _current_tests_data -def override_storages(storage, idx_storage): +def override_storages(storage, idx_storage, search): """ Helper function to replace the storages from which archive data are fetched. """ swh_config = config.get_config() - swh_config.update({'storage': storage}) - service.storage = storage + swh_config.update({ + 'storage': storage, + 'indexer_storage': idx_storage, + 'search': search, + }) - swh_config.update({'indexer_storage': idx_storage}) + service.storage = storage service.idx_storage = idx_storage + service.search = search # Implement some special endpoints used to provide input tests data