diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -3,3 +3,4 @@ swh.vault >= 0.0.23 swh.indexer >= 0.0.120 swh.scheduler >= 0.0.31 +swh.search >= 0.0.1 diff --git a/swh/web/api/views/origin.py b/swh/web/api/views/origin.py --- a/swh/web/api/views/origin.py +++ b/swh/web/api/views/origin.py @@ -193,10 +193,8 @@ and only the Link header should be used for paginating through results. - :param string url_pattern: a string pattern or a regular expression + :param string url_pattern: a string pattern :query int limit: the maximum number of found origins to return - :query boolean regexp: if true, consider provided pattern as a regular - expression and search origins whose urls match it :query boolean with_visit: if true, only return origins with at least one visit by Software heritage @@ -217,21 +215,19 @@ :swh_web_api:`origin/search/python/?limit=2` """ result = {} - offset = int(request.query_params.get('offset', '0')) limit = min(int(request.query_params.get('limit', '70')), 100) - regexp = request.query_params.get('regexp', 'false') + scroll_token = request.query_params.get('scroll_token') with_visit = request.query_params.get('with_visit', 'false') - results = api_lookup(service.search_origin, url_pattern, offset, limit, - bool(strtobool(regexp)), bool(strtobool(with_visit)), - enrich_fn=_enrich_origin) + (results, scroll_token) = api_lookup( + service.search_origin, url_pattern, limit, + scroll_token, bool(strtobool(with_visit)), + enrich_fn=_enrich_origin) - nb_results = len(results) - if nb_results == limit: + if scroll_token is not None: query_params = {} - query_params['offset'] = offset + limit query_params['limit'] = limit - query_params['regexp'] = regexp + query_params['scroll_token'] = scroll_token result['headers'] = { 'link-next': reverse('api-1-origin-search', @@ -240,7 +236,7 @@ } result.update({ - 'results': results + 'results': list(results) }) return result diff --git a/swh/web/common/service.py b/swh/web/common/service.py --- a/swh/web/common/service.py +++ b/swh/web/common/service.py @@ -17,6 +17,7 @@ from swh.web.common.origin_visits import get_origin_visit from swh.web import config +search = config.search() storage = config.storage() vault = config.vault() idx_storage = config.indexer_storage() @@ -241,8 +242,7 @@ return map(converters.from_origin, origins) -def search_origin(url_pattern, offset=0, limit=50, regexp=False, - with_visit=False): +def search_origin(url_pattern, limit=50, scroll_token=None, with_visit=False): """Search for origins whose urls contain a provided string pattern or match a provided regular expression. @@ -255,9 +255,11 @@ list of origin information as dict. """ - origins = storage.origin_search(url_pattern, offset, limit, regexp, - with_visit) - return map(converters.from_origin, origins) + results = search.origin_search(url_pattern=url_pattern, count=limit, + scroll_token=scroll_token, + with_visit=with_visit) + origins = map(converters.from_origin, results['results']) + return (origins, results['scroll_token']) def search_origin_metadata(fulltext, limit=50): diff --git a/swh/web/config.py b/swh/web/config.py --- a/swh/web/config.py +++ b/swh/web/config.py @@ -8,6 +8,7 @@ from swh.core import config from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler +from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings @@ -30,6 +31,13 @@ 'timeout': 1, } }), + 'search': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://127.0.0.1:5010/', + 'timeout': 10, + }, + }), 'log_dir': ('string', '/tmp/swh/log'), 'debug': ('bool', False), 'serve_assets': ('bool', False), @@ -121,6 +129,7 @@ cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, 'log_dir') + swhweb_config['search'] = get_search(**swhweb_config['search']) swhweb_config['storage'] = get_storage(**swhweb_config['storage']) swhweb_config['vault'] = get_vault(**swhweb_config['vault']) swhweb_config['indexer_storage'] = \ @@ -130,6 +139,13 @@ return swhweb_config +def search(): + """Return the current application's search. + + """ + return get_config()['search'] + + def storage(): """Return the current application's storage. diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py --- a/swh/web/tests/api/views/test_origin.py +++ b/swh/web/tests/api/views/test_origin.py @@ -316,6 +316,7 @@ rv = self.client.get(url) + self.maxDiff = None self.assertEqual(rv.status_code, 404, rv.data) self.assertEqual(rv['Content-Type'], 'application/json') self.assertEqual(rv.data, { @@ -402,7 +403,6 @@ 'reason': 'Origin with url %s not found!' % new_origin['url'] }) - @pytest.mark.origin_id def test_api_origin_search(self): expected_origins = { 'https://github.com/wcoder/highlightjs-line-numbers.js', @@ -440,24 +440,6 @@ self.assertEqual({origin['url'] for origin in rv.data}, expected_origins) - @pytest.mark.origin_id - def test_api_origin_search_regexp(self): - expected_origins = { - 'https://github.com/memononen/libtess2', - 'repo_with_submodules' - } - - url = reverse('api-1-origin-search', - url_args={'url_pattern': '(repo|libtess)'}, - query_params={'limit': 10, - 'regexp': True}) - rv = self.client.get(url) - self.assertEqual(rv.status_code, 200, rv.data) - self.assertEqual(rv['Content-Type'], 'application/json') - self.assertEqual({origin['url'] for origin in rv.data}, - expected_origins) - - @pytest.mark.origin_id @given(strategies.integers(min_value=1)) def test_api_origin_search_scroll(self, limit): expected_origins = { @@ -475,7 +457,7 @@ expected_origins) def test_api_origin_search_limit(self): - self.storage.origin_add([ + self.search.origin_update([ {'url': 'http://foobar/{}'.format(i)} for i in range(200) ]) diff --git a/swh/web/tests/data.py b/swh/web/tests/data.py --- a/swh/web/tests/data.py +++ b/swh/web/tests/data.py @@ -19,6 +19,7 @@ from swh.model.hashutil import hash_to_hex, hash_to_bytes, DEFAULT_ALGORITHMS from swh.model.identifiers import directory_identifier from swh.loader.git.from_disk import GitLoaderFromArchive +from swh.search import get_search from swh.storage.algos.dir_iterators import dir_iterator from swh.web import config from swh.web.browse.utils import ( @@ -243,6 +244,11 @@ # Create indexer storage instance that will be shared by indexers idx_storage = get_indexer_storage('memory', {}) + # Create search instance + search = get_search('memory', {}) + search.initialize() + search.origin_update({'url': origin['url']}for origin in _TEST_ORIGINS) + # Add the empty directory to the test archive empty_dir_id = directory_identifier({'entries': []}) empty_dir_id_bin = hash_to_bytes(empty_dir_id) @@ -252,6 +258,7 @@ return { 'storage': storage, 'idx_storage': idx_storage, + 'search': search, 'origins': _TEST_ORIGINS, 'contents': contents, 'directories': list(directories), @@ -310,17 +317,21 @@ return _current_tests_data -def override_storages(storage, idx_storage): +def override_storages(storage, idx_storage, search): """ Helper function to replace the storages from which archive data are fetched. """ swh_config = config.get_config() - swh_config.update({'storage': storage}) - service.storage = storage + swh_config.update({ + 'storage': storage, + 'indexer_storage': idx_storage, + 'search': search, + }) - swh_config.update({'indexer_storage': idx_storage}) + service.storage = storage service.idx_storage = idx_storage + service.search = search # Implement some special endpoints used to provide input tests data diff --git a/swh/web/tests/testcase.py b/swh/web/tests/testcase.py --- a/swh/web/tests/testcase.py +++ b/swh/web/tests/testcase.py @@ -37,13 +37,16 @@ tests_data = get_tests_data(reset=True) self.storage = tests_data['storage'] self.idx_storage = tests_data['idx_storage'] + self.search = tests_data['search'] self.mimetype_indexer = tests_data['mimetype_indexer'] self.license_indexer = tests_data['license_indexer'] self.ctags_indexer = tests_data['ctags_indexer'] # Update swh-web configuration to use the in-memory storages # instantiated in the tests.data module - override_storages(tests_data['storage'], tests_data['idx_storage']) + override_storages( + tests_data['storage'], tests_data['idx_storage'], + tests_data['search']) super()._pre_setup()