Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9696089
D1150.id3623.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D1150.id3623.diff
View Options
diff --git a/sql/upgrades/122.sql b/sql/upgrades/122.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/122.sql
@@ -0,0 +1,9 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 121
+-- to_version: 122
+-- description: add index to search origin_intrinsic_metadata for mappings.
+
+insert into dbversion(version, release, description)
+values(122, now(), 'Work In Progress');
+
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/sql/30-swh-schema.sql b/swh/indexer/sql/30-swh-schema.sql
--- a/swh/indexer/sql/30-swh-schema.sql
+++ b/swh/indexer/sql/30-swh-schema.sql
@@ -14,7 +14,7 @@
);
insert into dbversion(version, release, description)
- values(121, now(), 'Work In Progress');
+ values(122, now(), 'Work In Progress');
-- Computing metadata on sha1's contents
-- a SHA1 checksum (not necessarily originating from Git)
diff --git a/swh/indexer/sql/60-swh-indexes.sql b/swh/indexer/sql/60-swh-indexes.sql
--- a/swh/indexer/sql/60-swh-indexes.sql
+++ b/swh/indexer/sql/60-swh-indexes.sql
@@ -66,3 +66,4 @@
alter table origin_intrinsic_metadata validate constraint origin_intrinsic_metadata_revision_metadata_fkey;
create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
+create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -739,6 +739,44 @@
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
+ @db_transaction_generator()
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start=0, end=None, limit=100, ids_only=False,
+ mappings=None,
+ db=None, cur=None):
+ """Returns origins with intrinsic metadata, filtered by their producer.
+
+ Args:
+ start (int): The minimum origin id to return
+ end (int): The maximum origin id to return
+ limit (int): The maximum number of results to return
+ ids_only (bool): Determines whether only origin ids are returned
+ or the content as well
+ mappings (List[str]): Returns origins whose intrinsic metadata
+ were generated using at least one of these mappings.
+
+ Yields:
+ int or dict: origin ids (int) if `ids_only=True`, else
+ dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **metadata** (dict): associated metadata
+ - **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
+
+ """
+ res = db.origin_intrinsic_metadata_search_by_producer(
+ start, end, limit, ids_only, mappings, cur)
+ if ids_only:
+ for (origin_id,) in res:
+ yield origin_id
+ else:
+ for c in res:
+ yield converters.db_to_metadata(
+ dict(zip(db.origin_intrinsic_metadata_cols, c)))
+
@remote_api_endpoint('origin_intrinsic_metadata/stats')
@db_transaction()
def origin_intrinsic_metadata_stats(
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -348,8 +348,7 @@
self.origin_intrinsic_metadata_cols, cur=cur,
id_col='origin_id')
- def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit,
- cur=None):
+ def origin_intrinsic_metadata_search_fulltext(self, terms, *, limit, cur):
regconfig = self.origin_intrinsic_metadata_regconfig
tsquery_template = ' && '.join("plainto_tsquery('%s', %%s)" % regconfig
for _ in terms)
@@ -368,6 +367,42 @@
cur.execute(query, tsquery_args + [limit])
yield from cur
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start, end, limit, ids_only, mappings, cur):
+ if ids_only:
+ keys = 'oim.origin_id'
+ else:
+ keys = ', '.join(map(self._convert_key,
+ self.origin_intrinsic_metadata_cols))
+ query_parts = [
+ "SELECT %s" % keys,
+ "FROM origin_intrinsic_metadata AS oim",
+ "INNER JOIN indexer_configuration AS i",
+ "ON oim.indexer_configuration_id=i.id",
+ ]
+ args = []
+
+ where = []
+ if start:
+ where.append('oim.origin_id >= %s')
+ args.append(start)
+ if end:
+ where.append('oim.origin_id <= %s')
+ args.append(end)
+ if mappings is not None:
+ where.append('oim.mappings && %s')
+ args.append(mappings)
+ if where:
+ query_parts.append('WHERE')
+ query_parts.append(' AND '.join(where))
+
+ if limit:
+ query_parts.append('LIMIT %s')
+ args.append(limit)
+
+ cur.execute(' '.join(query_parts), args)
+ yield from cur
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -680,6 +680,50 @@
result['origin_id'] = result.pop('id')
yield result
+ def origin_intrinsic_metadata_search_by_producer(
+ self, start=0, end=None, limit=100, ids_only=False,
+ mappings=None,
+ db=None, cur=None):
+ """Returns origins with intrinsic metadata, filtered by their producer.
+
+ Args:
+ start (int): The minimum origin id to return
+ end (int): The maximum origin id to return
+ limit (int): The maximum number of results to return
+ ids_only (bool): Determines whether only origin ids are returned
+ or the content as well
+ mappings (List[str]): Returns origins whose intrinsic metadata
+ were generated using at least one of these mappings.
+
+ Yields:
+ int or dict: origin ids (int) if `ids_only=True`, else
+ dictionaries with the following keys:
+
+ - **origin_id** (int)
+ - **metadata** (dict): associated metadata
+ - **tool** (dict): tool used to compute metadata
+ - **mappings** (List[str]): list of mappings used to translate
+ these metadata
+
+ """
+ nb_results = 0
+ if mappings is not None:
+ mappings = frozenset(mappings)
+ for entry in self._origin_intrinsic_metadata.get_all():
+ if entry['id'] < start or (end and entry['id'] > end):
+ continue
+ if nb_results >= limit:
+ return
+ if mappings is not None and mappings.isdisjoint(entry['mappings']):
+ continue
+ if ids_only:
+ yield entry['id']
+ else:
+ entry = entry.copy()
+ entry['origin_id'] = entry.pop('id')
+ yield entry
+ nb_results += 1
+
def origin_intrinsic_metadata_stats(self):
"""Returns statistics on stored intrinsic metadata.
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -324,9 +324,9 @@
'7026b7c1a2af56521e9587659012345678904321')
self.revision_id_3 = hash_to_bytes(
'7026b7c1a2af56521e9587659012345678904320')
- self.origin_id_1 = 54974445
+ self.origin_id_1 = 44434341
self.origin_id_2 = 44434342
- self.origin_id_3 = 44434341
+ self.origin_id_3 = 54974445
def test_check_config(self):
self.assertTrue(self.storage.check_config(check_write=True))
@@ -1173,8 +1173,7 @@
[res['origin_id'] for res in search(['John', 'Jane'])],
[self.origin_id_1])
- def test_origin_intrinsic_metadata_stats(self):
- # given
+ def _fill_origin_intrinsic_metadata(self):
tool_id = self.tools['swh-metadata-detector']['id']
metadata1 = {
@@ -1228,7 +1227,6 @@
'from_revision': self.revision_id_3,
}
- # when
self.storage.revision_metadata_add([metadata1_rev])
self.storage.origin_intrinsic_metadata_add([metadata1_origin])
self.storage.revision_metadata_add([metadata2_rev])
@@ -1236,7 +1234,66 @@
self.storage.revision_metadata_add([metadata3_rev])
self.storage.origin_intrinsic_metadata_add([metadata3_origin])
- # then
+ def test_origin_intrinsic_metadata_search_by_producer(self):
+ self._fill_origin_intrinsic_metadata()
+ tool = self.tools['swh-metadata-detector']
+ endpoint = self.storage.origin_intrinsic_metadata_search_by_producer
+
+ # test pagination
+ self.assertCountEqual(
+ endpoint(ids_only=True),
+ [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=0, ids_only=True),
+ [self.origin_id_1, self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=0, limit=2, ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(start=self.origin_id_1+1, ids_only=True),
+ [self.origin_id_2, self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(start=self.origin_id_1+1, end=self.origin_id_3-1,
+ ids_only=True),
+ [self.origin_id_2])
+
+ # test mappings filtering
+ self.assertCountEqual(
+ endpoint(mappings=['npm'], ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['npm', 'gemspec'], ids_only=True),
+ [self.origin_id_1, self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['gemspec'], ids_only=True),
+ [self.origin_id_2])
+ self.assertCountEqual(
+ endpoint(mappings=['pkg-info'], ids_only=True),
+ [self.origin_id_3])
+ self.assertCountEqual(
+ endpoint(mappings=['foobar'], ids_only=True),
+ [])
+
+ # test pagination + mappings
+ self.assertCountEqual(
+ endpoint(mappings=['npm'], limit=1, ids_only=True),
+ [self.origin_id_1])
+
+ # test ids_only=False
+ self.assertEqual(list(endpoint(mappings=['gemspec'])), [{
+ 'origin_id': self.origin_id_2,
+ 'metadata': {
+ '@context': 'foo',
+ 'author': 'Jane Doe',
+ },
+ 'mappings': ['npm', 'gemspec'],
+ 'tool': tool,
+ 'from_revision': self.revision_id_2,
+ }])
+
+ def test_origin_intrinsic_metadata_stats(self):
+ self._fill_origin_intrinsic_metadata()
+
result = self.storage.origin_intrinsic_metadata_stats()
self.assertEqual(result, {
'per_mapping': {
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 6:52 PM (1 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221750
Attached To
D1150: Add idx storage endpoint to search metadata by mapping.
Event Timeline
Log In to Comment