diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -133,6 +133,11 @@
                                            'regexp': regexp,
                                            'with_visit': with_visit})
 
+    def origin_count(self, url_pattern, regexp=False, with_visit=False):
+        return self.post('origin/count', {'url_pattern': url_pattern,
+                                          'regexp': regexp,
+                                          'with_visit': with_visit})
+
     def origin_get_range(self, origin_from=1, origin_count=100):
         return self.post('origin/get_range', {'origin_from': origin_from,
                                               'origin_count': origin_count})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -236,6 +236,11 @@
     return encode_data(get_storage().origin_search(**decode_request(request)))
 
 
+@app.route('/origin/count', methods=['POST'])
+def origin_count():
+    return encode_data(get_storage().origin_count(**decode_request(request)))
+
+
 @app.route('/origin/add_multi', methods=['POST'])
 def origin_add():
     return encode_data(get_storage().origin_add(**decode_request(request)))
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -661,6 +661,42 @@
         cur.execute(query, query_params)
         yield from cur
 
+    def origin_count(self, url_pattern, regexp=False,
+                     with_visit=False, cur=None):
+        """Count origins whose urls contain a provided string pattern
+        or match a provided regular expression.
+        The pattern search in origin urls is performed in a case insensitive
+        way.
+
+        Args:
+            url_pattern (str): the string pattern to search for in origin urls
+            regexp (bool): if True, consider the provided pattern as a regular
+                expression and return origins whose urls match it
+            with_visit (bool): if True, filter out origins with no visit
+
+        Returns:
+            int: The number of origins matching the search criterion.
+        """
+        cur = self._cursor(cur)
+        query = """SELECT COUNT(*)
+                   FROM origin
+                   WHERE """
+        if with_visit:
+            query += """
+                   EXISTS (SELECT 1 from origin_visit WHERE origin=origin.id)
+                   AND """
+        query += 'url %s %%s'
+
+        if not regexp:
+            query = query % 'ILIKE'
+            query_params = ('%'+url_pattern+'%',)
+        else:
+            query = query % '~*'
+            query_params = (url_pattern,)
+
+        cur.execute(query, query_params)
+        return cur.fetchone()[0]
+
     person_cols = ['fullname', 'name', 'email']
     person_get_cols = person_cols + ['id']
 
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -867,9 +867,31 @@
         if with_visit:
             origins = [orig for orig in origins
                        if len(self._origin_visits[orig['id']-1]) > 0]
-        origins = copy.deepcopy(origins[offset:offset+limit])
+        if limit != 0:
+            origins = copy.deepcopy(origins[offset:offset+limit])
+        else:
+            origins = copy.deepcopy(origins)
         return origins
 
+    def origin_count(self, url_pattern, regexp=False, with_visit=False,
+                     db=None, cur=None):
+        """Count origins whose urls contain a provided string pattern
+        or match a provided regular expression.
+        The pattern search in origin urls is performed in a case insensitive
+        way.
+
+        Args:
+            url_pattern (str): the string pattern to search for in origin urls
+            regexp (bool): if True, consider the provided pattern as a regular
+                expression and return origins whose urls match it
+            with_visit (bool): if True, filter out origins with no visit
+
+        Returns:
+            int: The number of origins matching the search criterion.
+        """
+        return len(self.origin_search(url_pattern, regexp=regexp,
+                                      with_visit=with_visit, limit=0))
+
     def origin_add(self, origins):
         """Add origins to the storage
 
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1143,6 +1143,25 @@
                                        regexp, with_visit, cur):
             yield dict(zip(self.origin_keys, origin))
 
+    @db_transaction()
+    def origin_count(self, url_pattern, regexp=False,
+                     with_visit=False, db=None, cur=None):
+        """Count origins whose urls contain a provided string pattern
+        or match a provided regular expression.
+        The pattern search in origin urls is performed in a case insensitive
+        way.
+
+        Args:
+            url_pattern (str): the string pattern to search for in origin urls
+            regexp (bool): if True, consider the provided pattern as a regular
+                expression and return origins whose urls match it
+            with_visit (bool): if True, filter out origins with no visit
+
+        Returns:
+            int: The number of origins matching the search criterion.
+        """
+        return db.origin_count(url_pattern, regexp, with_visit, cur)
+
     @db_transaction_generator()
     def origin_get_range(self, origin_from=1, origin_count=100,
                          db=None, cur=None):
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2186,6 +2186,40 @@
             origin_count=origin_count))
         self.assertEqual(len(origins), 0)
 
+    def test_origin_count(self):
+
+        new_origins = [
+            {
+                'type': 'git',
+                'url': 'https://github.com/user1/repo1'
+            },
+            {
+                'type': 'git',
+                'url': 'https://github.com/user2/repo1'
+            },
+            {
+                'type': 'git',
+                'url': 'https://github.com/user3/repo1'
+            },
+            {
+                'type': 'git',
+                'url': 'https://gitlab.com/user1/repo1'
+            },
+            {
+                'type': 'git',
+                'url': 'https://gitlab.com/user2/repo1'
+            }
+        ]
+
+        self.storage.origin_add(new_origins)
+
+        self.assertEqual(self.storage.origin_count('github'), 3)
+        self.assertEqual(self.storage.origin_count('gitlab'), 2)
+        self.assertEqual(
+            self.storage.origin_count('.*user.*', regexp=True), 5)
+        self.assertEqual(
+            self.storage.origin_count('.*user1.*', regexp=True), 2)
+
 
 @pytest.mark.db
 class TestLocalStorage(CommonTestStorage, StorageTestDbFixture,