diff --git a/swh/web/assets/src/bundles/browse/origin-search.js b/swh/web/assets/src/bundles/browse/origin-search.js
--- a/swh/web/assets/src/bundles/browse/origin-search.js
+++ b/swh/web/assets/src/bundles/browse/origin-search.js
@@ -5,7 +5,6 @@
  * See top-level LICENSE file for more information
  */
 
-import {heapsPermute} from 'utils/heaps-permute';
 import {handleFetchError} from 'utils/functions';
 
 const limit = 100;
@@ -86,35 +85,19 @@
     });
 }
 
-function escapeStringRegexp(str) {
-  let matchOperatorsRe = /[|\\{}()[\]^$+*?.]/g;
-  return str.replace(matchOperatorsRe, '%5C$&');
-}
-
-function searchOriginsFirst(patterns, limit) {
+function searchOriginsFirst(searchQueryText, limit) {
   let baseSearchUrl;
   let searchMetadata = $('#swh-search-origin-metadata').prop('checked');
   if (searchMetadata) {
-    baseSearchUrl = Urls.api_1_origin_metadata_search() + `?fulltext=${patterns}`;
+    baseSearchUrl = Urls.api_1_origin_metadata_search() + '?fulltext=' + encodeURIComponent(searchQueryText);
   } else {
-    let patternsArray = patterns.trim().replace(/\s+/g, ' ').split(' ');
-    for (let i = 0; i < patternsArray.length; ++i) {
-      patternsArray[i] = escapeStringRegexp(patternsArray[i]);
-    }
-    // url length must be less than 4096 for modern browsers
-    // assuming average word length, 6 is max patternArray.length
-    if (patternsArray.length < 7) {
-      let patternsPermut = [];
-      heapsPermute(patternsArray, p => patternsPermut.push(p.join('.*')));
-      let regex = patternsPermut.join('|');
-      baseSearchUrl = Urls.api_1_origin_search(regex) + `?regexp=true`;
-    } else {
-      baseSearchUrl = Urls.api_1_origin_search(patternsArray.join('.*')) + `?regexp=true`;
-    }
+    baseSearchUrl = Urls.api_1_origin_search(searchQueryText);
   }
 
   let withVisit = $('#swh-search-origins-with-visit').prop('checked');
-  let searchUrl = baseSearchUrl + `&limit=${limit}&with_visit=${withVisit}`;
+  // the metadata search url already carries a query string ('?fulltext=...'),
+  // so further parameters must be appended with '&' rather than a second '?'
+  let searchUrl = baseSearchUrl + (searchMetadata ? '&' : '?') + `limit=${limit}&with_visit=${withVisit}`;
   searchOrigins(searchUrl);
 }
 
@@ -155,10 +138,10 @@
 
 function doSearch() {
   $('#swh-no-result').hide();
-  let patterns = $('#origins-url-patterns').val();
+  let searchQueryText = $('#origins-url-patterns').val();
   inSearch = true;
   // first try to resolve a swh persistent identifier
-  let resolvePidUrl = Urls.api_1_resolve_swh_pid(patterns);
+  let resolvePidUrl = Urls.api_1_resolve_swh_pid(searchQueryText);
   fetch(resolvePidUrl)
     .then(handleFetchError)
     .then(response => response.json())
@@ -169,7 +152,7 @@
     })
     .catch(response => {
       // pid resolving failed
-      if (patterns.startsWith('swh:')) {
+      if (searchQueryText.startsWith('swh:')) {
         // display a useful error message if the input
         // looks like a swh pid
         response.json().then(data => {
@@ -182,7 +165,7 @@
         // otherwise, proceed with origins search
         $('#swh-origin-search-results').show();
         $('.swh-search-pagination').show();
-        searchOriginsFirst(patterns, limit);
+        searchOriginsFirst(searchQueryText, limit);
       }
     });
 }
@@ -191,11 +174,11 @@
   $(document).ready(() => {
     $('#swh-search-origins').submit(event => {
       event.preventDefault();
-      let patterns = $('#origins-url-patterns').val().trim();
+      let searchQueryText = $('#origins-url-patterns').val().trim();
       let withVisit = $('#swh-search-origins-with-visit').prop('checked');
       let withContent = $('#swh-filter-empty-visits').prop('checked');
       let searchMetadata = $('#swh-search-origin-metadata').prop('checked');
-      let queryParameters = '?q=' + encodeURIComponent(patterns);
+      let queryParameters = '?q=' + encodeURIComponent(searchQueryText);
       if (withVisit) {
         queryParameters += '&with_visit';
       }
diff --git a/swh/web/assets/src/utils/heaps-permute.js b/swh/web/assets/src/utils/heaps-permute.js
deleted file mode 100644
--- a/swh/web/assets/src/utils/heaps-permute.js
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright (C) 2018 The Software Heritage developers
- * See the AUTHORS file at the top-level directory of this distribution
- * License: GNU Affero General Public License version 3, or any later version
- * See top-level LICENSE file for more information
- */
-
-// http://dsernst.com/2014/12/14/heaps-permutation-algorithm-in-javascript/
-
-function swap(array, pos1, pos2) {
-  let temp = array[pos1];
-  array[pos1] = array[pos2];
-  array[pos2] = temp;
-}
-
-export function heapsPermute(array, output, n) {
-  n = n || array.length; // set n default to array.length
-  if (n === 1) {
-    output(array);
-  } else {
-    for (let i = 1; i <= n; i += 1) {
-      heapsPermute(array, output, n - 1);
-      let j;
-      if (n % 2) {
-        j = 1;
-      } else {
-        j = i;
-      }
-      swap(array, j - 1, n - 1); // -1 to account for javascript zero-indexing
-    }
-  }
-}
diff --git a/swh/web/common/service.py b/swh/web/common/service.py
--- a/swh/web/common/service.py
+++ b/swh/web/common/service.py
@@ -3,7 +3,9 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import itertools
 import os
+import re
 
 from collections import defaultdict
 from typing import Any, Dict
@@ -259,6 +261,18 @@
         list of origin information as dict.
 
     """
+    if not regexp:
+        # If the query is not a regexp, rewrite it as a regexp.
+        regexp = True
+        search_words = [re.escape(word) for word in url_pattern.split()]
+        if len(search_words) >= 7:
+            url_pattern = '.*'.join(search_words)
+        else:
+            pattern_parts = []
+            for permut in itertools.permutations(search_words):
+                pattern_parts.append('.*'.join(permut))
+            url_pattern = '|'.join(pattern_parts)
+
     origins = storage.origin_search(url_pattern, offset, limit, regexp,
                                     with_visit)
     return map(converters.from_origin, origins)
diff --git a/swh/web/tests/api/views/test_origin.py b/swh/web/tests/api/views/test_origin.py
--- a/swh/web/tests/api/views/test_origin.py
+++ b/swh/web/tests/api/views/test_origin.py
@@ -454,6 +454,49 @@
     assert {origin['url'] for origin in rv.data} == expected_origins
 
 
+def test_api_origin_search_words(api_client):
+    expected_origins = {
+        'https://github.com/wcoder/highlightjs-line-numbers.js',
+        'https://github.com/memononen/libtess2',
+    }
+
+    url = reverse('api-1-origin-search',
+                  url_args={'url_pattern': 'github com'},
+                  query_params={'limit': 2})
+    rv = api_client.get(url)
+    assert rv.status_code == 200, rv.data
+    assert rv['Content-Type'] == 'application/json'
+    assert {origin['url'] for origin in rv.data} == expected_origins
+
+    url = reverse('api-1-origin-search',
+                  url_args={'url_pattern': 'com github'},
+                  query_params={'limit': 2})
+    rv = api_client.get(url)
+    assert rv.status_code == 200, rv.data
+    assert rv['Content-Type'] == 'application/json'
+    assert {origin['url'] for origin in rv.data} == expected_origins
+
+    url = reverse('api-1-origin-search',
+                  url_args={'url_pattern': 'memononen libtess2'},
+                  query_params={'limit': 2})
+    rv = api_client.get(url)
+    assert rv.status_code == 200, rv.data
+    assert rv['Content-Type'] == 'application/json'
+    assert len(rv.data) == 1
+    assert {origin['url'] for origin in rv.data} \
+        == {'https://github.com/memononen/libtess2'}
+
+    url = reverse('api-1-origin-search',
+                  url_args={'url_pattern': 'libtess2 memononen'},
+                  query_params={'limit': 2})
+    rv = api_client.get(url)
+    assert rv.status_code == 200, rv.data
+    assert rv['Content-Type'] == 'application/json'
+    assert len(rv.data) == 1
+    assert {origin['url'] for origin in rv.data} \
+        == {'https://github.com/memononen/libtess2'}
+
+
 def test_api_origin_search_regexp(api_client):
     expected_origins = {
         'https://github.com/memononen/libtess2',