diff --git a/sql/upgrades/122.sql b/sql/upgrades/122.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/122.sql
@@ -0,0 +1,9 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 121
+-- to_version: 122
+-- description: add an index to search origin_intrinsic_metadata by mappings.
+
+insert into dbversion(version, release, description)
+values(122, now(), 'Work In Progress');
+
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
 );
 
 insert into dbversion(version, release, description)
-      values(121, now(), 'Work In Progress');
+      values(122, now(), 'Work In Progress');
 
 -- Computing metadata on sha1's contents
 -- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql
--- a/swh/indexer/sql/60-swh-indexes.sql
+++ b/swh/indexer/sql/60-swh-indexes.sql
@@ -66,3 +66,4 @@
 alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey;
 
 create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -739,6 +739,44 @@
             yield converters.db_to_metadata(
                 dict(zip(db.origin_intrinsic_metadata_cols, c)))
 
+    @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
+    @db_transaction_generator()
+    def origin_intrinsic_metadata_search_by_producer(
+            self, start=0, end=None, limit=100, ids_only=False,
+            mappings=None,
+            db=None, cur=None):
+        """Returns origins whose metadata were generated by the given mappings.
+
+        Args:
+            start (int): The minimum origin id to return
+            end (int): The maximum origin id to return
+            limit (int): The maximum number of results to return
+            ids_only (bool): Determines whether only origin ids are returned
+                or the content as well
+            mappings (List[str]): Restrict results to origins whose intrinsic
+                metadata were generated using at least one of these mappings.
+
+        Yields:
+            list: list of origin ids (int) if `ids_only=True`, else
+                dictionaries with the following keys:
+
+                - **origin_id** (int)
+                - **metadata** (str): associated metadata
+                - **tool** (dict): tool used to compute metadata
+                - **mappings** (List[str]): list of mappings used to translate
+                  these metadata
+
+        """
+        res = db.origin_intrinsic_metadata_search_by_producer(
+            start, end, limit, ids_only, mappings, cur)
+        if ids_only:
+            for (origin_id,) in res:
+                yield origin_id
+        else:
+            for c in res:
+                yield converters.db_to_metadata(
+                    dict(zip(db.origin_intrinsic_metadata_cols, c)))
+
     @remote_api_endpoint('origin_intrinsic_metadata/stats')
     @db_transaction()
     def origin_intrinsic_metadata_stats(
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -348,8 +348,7 @@
             self.origin_intrinsic_metadata_cols, cur=cur,
             id_col='origin_id')
 
-    def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit,
-                                                  cur=None):
+    def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur):
         regconfig = self.origin_intrinsic_metadata_regconfig
         tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig
                                        for _ in terms)
@@ -368,6 +367,42 @@
         cur.execute(query, tsquery_args + [limit])
         yield from cur
 
+    def origin_intrinsic_metadata_search_by_producer(
+            self, start, end, limit, ids_only, mappings, cur):
+        if ids_only:
+            keys = 'oim.origin_id'
+        else:
+            keys = ', '.join(map(self._convert_key,
+                                 self.origin_intrinsic_metadata_cols))
+        query_parts = [
+            "SELECT %s" % keys,
+            "FROM origin_intrinsic_metadata AS oim",
+            "INNER JOIN indexer_configuration AS i",
+            "ON oim.indexer_configuration_id=i.id",
+        ]
+        args = []
+
+        where = []
+        if start:
+            where.append('oim.origin_id >= %s')
+            args.append(start)
+        if end:
+            where.append('oim.origin_id <= %s')
+            args.append(end)
+        if mappings is not None:
+            where.append('oim.mappings && %s')
+            args.append(mappings)
+        if where:
+            query_parts.append('WHERE')
+            query_parts.append(' AND '.join(where))
+
+        if limit:
+            query_parts.append('LIMIT %s')
+            args.append(limit)
+
+        cur.execute(' '.join(query_parts), args)
+        yield from cur
+
     indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
                                   'tool_configuration']
 
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -680,6 +680,50 @@
                 result['origin_id'] = result.pop('id')
                 yield result
 
+    def origin_intrinsic_metadata_search_by_producer(
+            self, start=0, end=None, limit=100, ids_only=False,
+            mappings=None,
+            db=None, cur=None):
+        """Returns origins whose metadata were generated by the given mappings.
+
+        Args:
+            start (int): The minimum origin id to return
+            end (int): The maximum origin id to return
+            limit (int): The maximum number of results to return
+            ids_only (bool): Determines whether only origin ids are returned
+                or the content as well
+            mappings (List[str]): Restrict results to origins whose intrinsic
+                metadata were generated using at least one of these mappings.
+
+        Yields:
+            list: list of origin ids (int) if `ids_only=True`, else
+                dictionaries with the following keys:
+
+                - **origin_id** (int)
+                - **metadata** (str): associated metadata
+                - **tool** (dict): tool used to compute metadata
+                - **mappings** (List[str]): list of mappings used to translate
+                  these metadata
+
+        """
+        nb_results = 0
+        if mappings is not None:
+            mappings = frozenset(mappings)
+        for entry in self._origin_intrinsic_metadata.get_all():
+            if entry['id'] < start or (end and entry['id'] > end):
+                continue
+            if nb_results >= limit:
+                return
+            if mappings is not None and mappings.isdisjoint(entry['mappings']):
+                continue
+            if ids_only:
+                yield entry['id']
+            else:
+                entry = entry.copy()
+                entry['origin_id'] = entry.pop('id')
+                yield entry
+            nb_results += 1
+
     def origin_intrinsic_metadata_stats(self):
         """Returns statistics on stored intrinsic metadata.
 
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -324,9 +324,9 @@
             '7026b7c1a2af56521e9587659012345678904321')
         self.revision_id_3 = hash_to_bytes(
             '7026b7c1a2af56521e9587659012345678904320')
-        self.origin_id_1 = 54974445
+        self.origin_id_1 = 44434341
         self.origin_id_2 = 44434342
-        self.origin_id_3 = 44434341
+        self.origin_id_3 = 54974445
 
     def test_check_config(self):
         self.assertTrue(self.storage.check_config(check_write=True))
@@ -1173,8 +1173,7 @@
             [res['origin_id'] for res in search(['John', 'Jane'])],
             [self.origin_id_1])
 
-    def test_origin_intrinsic_metadata_stats(self):
-        # given
+    def _fill_origin_intrinsic_metadata(self):
         tool_id = self.tools['swh-metadata-detector']['id']
 
         metadata1 = {
@@ -1228,7 +1227,6 @@
             'from_revision': self.revision_id_3,
         }
 
-        # when
         self.storage.revision_metadata_add([metadata1_rev])
         self.storage.origin_intrinsic_metadata_add([metadata1_origin])
         self.storage.revision_metadata_add([metadata2_rev])
@@ -1236,7 +1234,66 @@
         self.storage.revision_metadata_add([metadata3_rev])
         self.storage.origin_intrinsic_metadata_add([metadata3_origin])
 
-        # then
+    def test_origin_intrinsic_metadata_search_by_producer(self):
+        self._fill_origin_intrinsic_metadata()
+        tool = self.tools['swh-metadata-detector']
+        endpoint = self.storage.origin_intrinsic_metadata_search_by_producer
+
+        # test pagination
+        self.assertCountEqual(
+            endpoint(ids_only=True),
+            [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+        self.assertCountEqual(
+            endpoint(start=0, ids_only=True),
+            [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+        self.assertCountEqual(
+            endpoint(start=0, limit=2, ids_only=True),
+            [self.origin_id_1, self.origin_id_2])
+        self.assertCountEqual(
+            endpoint(start=self.origin_id_1+1, ids_only=True),
+            [self.origin_id_2, self.origin_id_3])
+        self.assertCountEqual(
+            endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+                     ids_only=True),
+            [self.origin_id_2])
+
+        # test mappings filtering
+        self.assertCountEqual(
+            endpoint(mappings=['npm'], ids_only=True),
+            [self.origin_id_1, self.origin_id_2])
+        self.assertCountEqual(
+            endpoint(mappings=['npm', 'gemspec'], ids_only=True),
+            [self.origin_id_1, self.origin_id_2])
+        self.assertCountEqual(
+            endpoint(mappings=['gemspec'], ids_only=True),
+            [self.origin_id_2])
+        self.assertCountEqual(
+            endpoint(mappings=['pkg-info'], ids_only=True),
+            [self.origin_id_3])
+        self.assertCountEqual(
+            endpoint(mappings=['foobar'], ids_only=True),
+            [])
+
+        # test pagination + mappings
+        self.assertCountEqual(
+            endpoint(mappings=['npm'], limit=1, ids_only=True),
+            [self.origin_id_1])
+
+        # test ids_only=False
+        self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
+            'origin_id': self.origin_id_2,
+            'metadata': {
+                '@context': 'foo',
+                'author': 'Jane Doe',
+            },
+            'mappings': ['npm', 'gemspec'],
+            'tool': tool,
+            'from_revision': self.revision_id_2,
+        }])
+
+    def test_origin_intrinsic_metadata_stats(self):
+        self._fill_origin_intrinsic_metadata()
+
         result = self.storage.origin_intrinsic_metadata_stats()
         self.assertEqual(result, {
             'per_mapping': {
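
Usage note (not part of the patch): a minimal sketch of how the new endpoint could be exercised once this lands. It assumes an already-configured indexer storage instance named `storage`; the mapping names, pagination values and connection string are illustrative only.

    # Assumed setup, details vary per deployment:
    #   from swh.indexer.storage import get_indexer_storage
    #   storage = get_indexer_storage(
    #       'local', {'db': 'dbname=softwareheritage-indexer-dev'})

    # Page through origin ids whose intrinsic metadata was produced by the
    # npm or gemspec mappings; start/end bound the origin id range and
    # limit caps the page size, mirroring the arguments added in this diff.
    origin_ids = list(storage.origin_intrinsic_metadata_search_by_producer(
        start=0, limit=100, ids_only=True, mappings=['npm', 'gemspec']))

    # With ids_only left at its default (False), full records are yielded
    # instead: dicts with origin_id, metadata, tool, mappings and
    # from_revision keys, as checked in the new test above.
    for row in storage.origin_intrinsic_metadata_search_by_producer(
            mappings=['gemspec']):
        print(row['origin_id'], row['mappings'])

On the database side, the mappings filter is the `oim.mappings && %s` overlap clause built in db.py; the new origin_intrinsic_metadata_mappings_idx GIN index is there so PostgreSQL can answer that clause with an index scan instead of a sequential scan over origin_intrinsic_metadata.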