D1150.id3623.diff

diff --git a/sql/upgrades/122.sql b/sql/upgrades/122.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/122.sql
@@ -0,0 +1,9 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 121
+-- to_version: 122
+-- description: add an index for searching origin_intrinsic_metadata by mappings.
+
+insert into dbversion(version, release, description)
+values(122, now(), 'Work In Progress');
+
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
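The GIN index added above (and mirrored in 60-swh-indexes.sql below) speeds up array-overlap filters on the mappings column, which is how the new search endpoint selects origins by producer. A minimal sketch of the kind of query it serves, assuming a psycopg2 cursor named cur (illustrative only, not part of this diff):

    # Illustrative only: psycopg2 adapts the Python list to a PostgreSQL
    # array, and the overlap operator (&&) can use the new GIN index.
    cur.execute(
        "SELECT origin_id FROM origin_intrinsic_metadata"
        " WHERE mappings && %s",
        (['npm', 'gemspec'],))
    matching_origin_ids = [row[0] for row in cur]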
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(121, now(), 'Work In Progress');
+ values(122, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql
--- a/swh/indexer/sql/60-swh-indexes.sql
+++ b/swh/indexer/sql/60-swh-indexes.sql
@@ -66,3 +66,4 @@
alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey;
create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -739,6 +739,44 @@
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
+ @db_transaction_generator()
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start=0, end=None, limit=100, ids_only=False,
+ mappings=None,
+ db=None, cur=None):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ start (int): The minimum origin id to return
+ end (int): The maximum origin id to return
+ limit (int): The maximum number of results to return
+ ids_only (bool): Determines whether only origin ids are returned
+ or the content as well
+ mappings (List[str]): Returns origins whose intrinsic metadata
+ were generated using at least one of these mappings.
+
+ Yields:
+ list: list of origin ids (int) if `ids_only=True`, else
+ dictionaries with the following keys:
+
+            - **origin_id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
+
+ """
+ res = db.origin_intrinsic_metadata_search_by_producer(
+ start, end, limit, ids_only, mappings, cur)
+ if ids_only:
+ for (origin_id,) in res:
+ yield origin_id
+ else:
+ for c in res:
+ yield converters.db_to_metadata(
+ dict(zip(db.origin_intrinsic_metadata_cols, c)))
+
@remote_api_endpoint('origin_intrinsic_metadata/stats')
@db_transaction()
def origin_intrinsic_metadata_stats(
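For context, a hedged usage sketch of the endpoint added above, assuming an indexer storage instance named idx_storage obtained elsewhere (the variable name is illustrative):

    # Illustrative only: list origin ids whose intrinsic metadata was
    # produced by at least one of the npm or gemspec mappings.
    for origin_id in idx_storage.origin_intrinsic_metadata_search_by_producer(
            mappings=['npm', 'gemspec'], ids_only=True):
        print(origin_id)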
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -348,8 +348,7 @@
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='origin_id')
- def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit,
- cur=None):
+ def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur):
regconfig = self.origin_intrinsic_metadata_regconfig
tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig
for _ in terms)
@@ -368,6 +367,42 @@
cur.execute(query, tsquery_args + [limit])
yield from cur
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start, end, limit, ids_only, mappings, cur):
+ if ids_only:
+ keys = 'oim.origin_id'
+ else:
+ keys = ', '.join(map(self._convert_key,
+ self.origin_intrinsic_metadata_cols))
+ query_parts = [
+ "SELECT %s" % keys,
+ "FROM origin_intrinsic_metadata AS oim",
+ "INNER JOIN indexer_configuration AS i",
+ "ON oim.indexer_configuration_id=i.id",
+ ]
+ args = []
+
+ where = []
+ if start:
+ where.append('oim.origin_id >= %s')
+ args.append(start)
+ if end:
+ where.append('oim.origin_id <= %s')
+ args.append(end)
+ if mappings is not None:
+ where.append('oim.mappings && %s')
+ args.append(mappings)
+ if where:
+ query_parts.append('WHERE')
+ query_parts.append(' AND '.join(where))
+
+ if limit:
+ query_parts.append('LIMIT %s')
+ args.append(limit)
+
+ cur.execute(' '.join(query_parts), args)
+ yield from cur
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
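For illustration, a call such as origin_intrinsic_metadata_search_by_producer(start=1, end=None, limit=100, ids_only=True, mappings=['npm'], cur=cur) assembles roughly the following statement from query_parts (a sketch, not output captured from the code):

    # Sketch of the assembled query and its parameter list (illustrative):
    query = ("SELECT oim.origin_id"
             " FROM origin_intrinsic_metadata AS oim"
             " INNER JOIN indexer_configuration AS i"
             " ON oim.indexer_configuration_id=i.id"
             " WHERE oim.origin_id >= %s AND oim.mappings && %s"
             " LIMIT %s")
    args = [1, ['npm'], 100]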
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -680,6 +680,50 @@
result['origin_id'] = result.pop('id')
yield result
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start=0, end=None, limit=100, ids_only=False,
+ mappings=None,
+ db=None, cur=None):
+ """Returns the list of origins whose metadata contain all the terms.
+
+ Args:
+ start (int): The minimum origin id to return
+ end (int): The maximum origin id to return
+ limit (int): The maximum number of results to return
+ ids_only (bool): Determines whether only origin ids are returned
+ or the content as well
+ mappings (List[str]): Returns origins whose intrinsic metadata
+ were generated using at least one of these mappings.
+
+ Yields:
+ list: list of origin ids (int) if `ids_only=True`, else
+ dictionaries with the following keys:
+
+            - **origin_id** (int)
+ - **metadata** (str): associated metadata
+ - **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
+
+ """
+ nb_results = 0
+ if mappings is not None:
+ mappings = frozenset(mappings)
+ for entry in self._origin_intrinsic_metadata.get_all():
+ if entry['id'] < start or (end and entry['id'] > end):
+ continue
+ if nb_results >= limit:
+ return
+ if mappings is not None and mappings.isdisjoint(entry['mappings']):
+ continue
+ if ids_only:
+ yield entry['id']
+ else:
+ entry = entry.copy()
+ entry['origin_id'] = entry.pop('id')
+ yield entry
+ nb_results += 1
+
def origin_intrinsic_metadata_stats(self):
"""Returns statistics on stored intrinsic metadata.
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -324,9 +324,9 @@
'7026b7c1a2af56521e9587659012345678904321')
self.revision_id_3 = hash_to_bytes(
'7026b7c1a2af56521e9587659012345678904320')
- self.origin_id_1 = 54974445
+ self.origin_id_1 = 44434341
self.origin_id_2 = 44434342
- self.origin_id_3 = 44434341
+ self.origin_id_3 = 54974445
def test_check_config(self):
self.assertTrue(self.storage.check_config(check_write=True))
@@ -1173,8 +1173,7 @@
[res['origin_id'] for res in search(['John', 'Jane'])],
[self.origin_id_1])
- def test_origin_intrinsic_metadata_stats(self):
- # given
+ def _fill_origin_intrinsic_metadata(self):
tool_id = self.tools['swh-metadata-detector']['id']
metadata1 = {
@@ -1228,7 +1227,6 @@
'from_revision': self.revision_id_3,
}
- # when
self.storage.revision_metadata_add([metadata1_rev])
self.storage.origin_intrinsic_metadata_add([metadata1_origin])
self.storage.revision_metadata_add([metadata2_rev])
@@ -1236,7 +1234,66 @@
self.storage.revision_metadata_add([metadata3_rev])
self.storage.origin_intrinsic_metadata_add([metadata3_origin])
- # then
+ def test_origin_intrinsic_metadata_search_by_producer(self):
+ self._fill_origin_intrinsic_metadata()
+ tool = self.tools['swh-metadata-detector']
+ endpoint = self.storage.origin_intrinsic_metadata_search_by_producer
+
+ # test pagination
+ self.assertCountEqual(
+ endpoint(ids_only=True),
+ [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=0, ids_only=True),
+ [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=0, limit=2, ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(start=self.origin_id_1+1, ids_only=True),
+ [self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+ ids_only=True),
+ [self.origin_id_2])
+
+ # test mappings filtering
+ self.assertCountEqual(
+ endpoint(mappings=['npm'], ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['npm', 'gemspec'], ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['gemspec'], ids_only=True),
+ [self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['pkg-info'], ids_only=True),
+ [self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(mappings=['foobar'], ids_only=True),
+ [])
+
+ # test pagination + mappings
+ self.assertCountEqual(
+ endpoint(mappings=['npm'], limit=1, ids_only=True),
+ [self.origin_id_1])
+
+ # test ids_only=False
+ self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
+ 'origin_id': self.origin_id_2,
+ 'metadata': {
+ '@context': 'foo',
+ 'author': 'Jane Doe',
+ },
+ 'mappings': ['npm', 'gemspec'],
+ 'tool': tool,
+ 'from_revision': self.revision_id_2,
+ }])
+
+ def test_origin_intrinsic_metadata_stats(self):
+ self._fill_origin_intrinsic_metadata()
+
result = self.storage.origin_intrinsic_metadata_stats()
self.assertEqual(result, {
'per_mapping': {
