diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -4,3 +4,4 @@ swh.indexer >= 0.0.120 swh.scheduler >= 0.0.31 swh.core >= 0.0.81 +swh.search >= 0.0.3 diff --git a/swh/web/common/service.py b/swh/web/common/service.py --- a/swh/web/common/service.py +++ b/swh/web/common/service.py @@ -23,6 +23,7 @@ from swh.web.common.origin_visits import get_origin_visit from swh.web import config +search = config.search() storage = config.storage() vault = config.vault() idx_storage = config.indexer_storage() @@ -260,25 +261,33 @@ list of origin information as dict. """ - offset = int(page_token) if page_token else 0 - regexp = True - search_words = [re.escape(word) for word in url_pattern.split()] - if len(search_words) >= 7: - url_pattern = '.*'.join(search_words) + if search: + results = search.origin_search(url_pattern=url_pattern, count=limit, + page_token=page_token, + with_visit=with_visit) + origins = list(map(converters.from_origin, results['results'])) + return (origins, results['next_page_token']) else: - pattern_parts = [] - for permut in itertools.permutations(search_words): - pattern_parts.append('.*'.join(permut)) - url_pattern = '|'.join(pattern_parts) - - origins = storage.origin_search(url_pattern, offset, limit, regexp, - with_visit) - origins = list(map(converters.from_origin, origins)) - if len(origins) >= limit: - page_token = str(offset + len(origins)) - else: - page_token = None - return (origins, page_token) + # Fallback to swh-storage if swh-search is not configured + offset = int(page_token) if page_token else 0 + regexp = True + search_words = [re.escape(word) for word in url_pattern.split()] + if len(search_words) >= 7: + url_pattern = '.*'.join(search_words) + else: + pattern_parts = [] + for permut in itertools.permutations(search_words): + pattern_parts.append('.*'.join(permut)) + url_pattern = '|'.join(pattern_parts) + + origins = storage.origin_search(url_pattern, offset, limit, regexp, + with_visit) + origins = list(map(converters.from_origin, origins)) + if len(origins) >= limit: + page_token = str(offset + len(origins)) + else: + page_token = None + return (origins, page_token) def search_origin_metadata(fulltext, limit=50): diff --git a/swh/web/config.py b/swh/web/config.py --- a/swh/web/config.py +++ b/swh/web/config.py @@ -10,6 +10,7 @@ from swh.core import config from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler +from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings @@ -18,6 +19,13 @@ DEFAULT_CONFIG = { 'allowed_hosts': ('list', []), + 'search': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://127.0.0.1:5010/', + 'timeout': 10, + }, + }), 'storage': ('dict', { 'cls': 'remote', 'url': 'http://127.0.0.1:5002/', @@ -128,6 +136,10 @@ cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, 'log_dir') + if swhweb_config.get('search'): + swhweb_config['search'] = get_search(**swhweb_config['search']) + else: + swhweb_config['search'] = None swhweb_config['storage'] = get_storage(**swhweb_config['storage']) swhweb_config['vault'] = get_vault(**swhweb_config['vault']) swhweb_config['indexer_storage'] = \ @@ -137,6 +149,13 @@ return swhweb_config +def search(): + """Return the current application's search. + + """ + return get_config()['search'] + + def storage(): """Return the current application's storage. diff --git a/swh/web/settings/tests.py b/swh/web/settings/tests.py --- a/swh/web/settings/tests.py +++ b/swh/web/settings/tests.py @@ -101,7 +101,8 @@ }) from swh.web.tests.data import get_tests_data, override_storages # noqa test_data = get_tests_data() - override_storages(test_data['storage'], test_data['idx_storage']) + override_storages(test_data['storage'], test_data['idx_storage'], + test_data['search']) else: ALLOWED_HOSTS += ['testserver'] diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py --- a/swh/web/tests/api/views/test_origin.py +++ b/swh/web/tests/api/views/test_origin.py @@ -390,7 +390,12 @@ } -def test_api_origin_search(api_client): +@pytest.mark.parametrize('backend', ['swh-search', 'swh-storage']) +def test_api_origin_search(api_client, mocker, backend): + if backend != 'swh-search': + # equivalent to not configuring search in the config + mocker.patch('swh.web.common.service.search', None) + expected_origins = { 'https://github.com/wcoder/highlightjs-line-numbers.js', 'https://github.com/memononen/libtess2', @@ -425,7 +430,12 @@ assert {origin['url'] for origin in rv.data} == expected_origins -def test_api_origin_search_words(api_client): +@pytest.mark.parametrize('backend', ['swh-search', 'swh-storage']) +def test_api_origin_search_words(api_client, mocker, backend): + if backend != 'swh-search': + # equivalent to not configuring search in the config + mocker.patch('swh.web.common.service.search', None) + expected_origins = { 'https://github.com/wcoder/highlightjs-line-numbers.js', 'https://github.com/memononen/libtess2', @@ -468,8 +478,15 @@ == {'https://github.com/memononen/libtess2'} +@pytest.mark.parametrize('backend', ['swh-search', 'swh-storage']) @pytest.mark.parametrize('limit', [1, 2, 3, 10]) -def test_api_origin_search_scroll(api_client, archive_data, limit): +def test_api_origin_search_scroll( + api_client, archive_data, mocker, limit, backend): + + if backend != 'swh-search': + # equivalent to not configuring search in the config + mocker.patch('swh.web.common.service.search', None) + expected_origins = { 'https://github.com/wcoder/highlightjs-line-numbers.js', 'https://github.com/memononen/libtess2', @@ -484,11 +501,22 @@ assert {origin['url'] for origin in results} == expected_origins -def test_api_origin_search_limit(api_client, archive_data): - archive_data.origin_add([ - {'url': 'http://foobar/{}'.format(i)} - for i in range(2000) - ]) +@pytest.mark.parametrize('backend', ['swh-search', 'swh-storage']) +def test_api_origin_search_limit( + api_client, archive_data, tests_data, mocker, backend): + if backend == 'swh-search': + tests_data['search'].origin_update([ + {'url': 'http://foobar/{}'.format(i)} + for i in range(2000) + ]) + else: + # equivalent to not configuring search in the config + mocker.patch('swh.web.common.service.search', None) + + archive_data.origin_add([ + {'url': 'http://foobar/{}'.format(i)} + for i in range(2000) + ]) url = reverse('api-1-origin-search', url_args={'url_pattern': 'foobar'}, diff --git a/swh/web/tests/conftest.py b/swh/web/tests/conftest.py --- a/swh/web/tests/conftest.py +++ b/swh/web/tests/conftest.py @@ -106,7 +106,7 @@ data = get_tests_data(reset=True) # Update swh-web configuration to use the in-memory storages # instantiated in the tests.data module - override_storages(data['storage'], data['idx_storage']) + override_storages(data['storage'], data['idx_storage'], data['search']) return data diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py --- a/swh/web/tests/data.py +++ b/swh/web/tests/data.py @@ -20,6 +20,7 @@ from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS from swh.model.identifiers import directory_identifier from swh.loader.git.from_disk import GitLoaderFromArchive +from swh.search import get_search from swh.storage.algos.dir_iterators import dir_iterator from swh.web import config from swh.web.browse.utils import ( @@ -159,6 +160,11 @@ # To hold reference to the memory storage storage = None + # Create search instance + search = get_search('memory', {}) + search.initialize() + search.origin_update({'url': origin['url']} for origin in _TEST_ORIGINS) + # Load git repositories from archives for origin in _TEST_ORIGINS: for i, archive in enumerate(origin['archives']): @@ -176,10 +182,12 @@ loader.load() origin.update(storage.origin_get(origin)) # add an 'id' key if enabled + search.origin_update([{'url': origin['url'], 'has_visits': True}]) for i in range(250): url = 'https://many.origins/%d' % (i+1) storage.origin_add([{'url': url}]) + search.origin_update([{'url': url, 'has_visits': True}]) visit = storage.origin_visit_add(url, '2019-12-03 13:55:05', 'tar') storage.origin_visit_update( url, visit['visit'], @@ -250,6 +258,7 @@ # Return tests data return { + 'search': search, 'storage': storage, 'idx_storage': idx_storage, 'origins': _TEST_ORIGINS, @@ -310,17 +319,21 @@ return _current_tests_data -def override_storages(storage, idx_storage): +def override_storages(storage, idx_storage, search): """ Helper function to replace the storages from which archive data are fetched. """ swh_config = config.get_config() - swh_config.update({'storage': storage}) - service.storage = storage + swh_config.update({ + 'storage': storage, + 'indexer_storage': idx_storage, + 'search': search, + }) - swh_config.update({'indexer_storage': idx_storage}) + service.storage = storage service.idx_storage = idx_storage + service.search = search # Implement some special endpoints used to provide input tests data