diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -123,11 +123,13 @@ def origin_get(self, origin): return self.post('origin/get', {'origin': origin}) - def origin_search(self, url_pattern, offset=0, limit=50, regexp=False): + def origin_search(self, url_pattern, offset=0, limit=50, regexp=False, + with_visit=False): return self.post('origin/search', {'url_pattern': url_pattern, 'offset': offset, 'limit': limit, - 'regexp': regexp}) + 'regexp': regexp, + 'with_visit': with_visit}) def origin_add(self, origins): return self.post('origin/add_multi', {'origins': origins}) diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -863,23 +863,32 @@ return None def origin_search(self, url_pattern, offset=0, limit=50, - regexp=False, cur=None): + regexp=False, with_visit=False, cur=None): """Search for origins whose urls contain a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. Args: - url_pattern: the string pattern to search for in origin urls - offset: number of found origins to skip before returning results - limit: the maximum number of found origins to return - regexp: if True, consider the provided pattern as a regular + url_pattern (str): the string pattern to search for in origin urls + offset (int): number of found origins to skip before returning + results + limit (int): the maximum number of found origins to return + regexp (bool): if True, consider the provided pattern as a regular expression and returns origins whose urls match it + with_visit (bool): if True, filter out origins with no visit """ cur = self._cursor(cur) origin_cols = ','.join(self.origin_cols) query = """SELECT %s - FROM origin WHERE url %s %%s + FROM origin + WHERE """ + if with_visit: + query += """ + EXISTS (SELECT 1 from origin_visit WHERE origin=origin.id) + AND """ + query += """ + url %s %%s ORDER BY id OFFSET %%s LIMIT %%s""" diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1128,24 +1128,26 @@ @db_transaction_generator() def origin_search(self, url_pattern, offset=0, limit=50, - regexp=False, db=None, cur=None): + regexp=False, with_visit=False, db=None, cur=None): """Search for origins whose urls contain a provided string pattern or match a provided regular expression. The search is performed in a case insensitive way. Args: - url_pattern: the string pattern to search for in origin urls - offset: number of found origins to skip before returning results - limit: the maximum number of found origins to return - regexp: if True, consider the provided pattern as a regular + url_pattern (str): the string pattern to search for in origin urls + offset (int): number of found origins to skip before returning + results + limit (int): the maximum number of found origins to return + regexp (bool): if True, consider the provided pattern as a regular expression and return origins whose urls match it + with_visit (bool): if True, filter out origins with no visit Returns: An iterable of dict containing origin information as returned by :meth:`swh.storage.storage.Storage.origin_get`. """ for origin in db.origin_search(url_pattern, offset, limit, - regexp, cur): + regexp, with_visit, cur): yield dict(zip(self.origin_keys, origin)) @db_transaction()