Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066482
D235.id774.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
22 KB
Subscribers
None
D235.id774.diff
View Options
diff --git a/sql/json/revision_metadata.translated_metadata.json b/sql/json/revision_metadata.translated_metadata.json
new file mode 100644
--- /dev/null
+++ b/sql/json/revision_metadata.translated_metadata.json
@@ -0,0 +1,59 @@
+{
+ "$schema": "http://json-schema.org/schema#",
+ "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json",
+
+ "type": "object",
+ "properties": {
+ "developmentStatus": {
+ "type": "array"
+ },
+ "version": {
+ "type": "array"
+ },
+ "operatingSystem": {
+ "type": "array"
+ },
+ "description": {
+ "type": "array"
+ },
+ "keywords": {
+ "type": "array"
+ },
+ "issueTracker": {
+ "type": "array"
+ },
+ "name": {
+ "type": "array"
+ },
+ "author": {
+ "type": "array"
+ },
+ "relatedLink": {
+ "type": "array"
+ },
+ "url": {
+ "type": "array"
+ },
+ "type": {
+ "type": "array"
+ },
+ "license": {
+ "type": "array"
+ },
+ "maintainer": {
+ "type": "array"
+ },
+ "email": {
+ "type": "array"
+ },
+ "softwareRequirements": {
+ "type": "array"
+ },
+ "identifier": {
+ "type": "array"
+ },
+ "codeRepository": {
+ "type": "array"
+ }
+ }
+}
diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -875,3 +875,6 @@
insert into indexer_configuration(tool_name, tool_version, tool_configuration)
values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1665,8 +1665,8 @@
as $$
begin
if conflict_update then
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
from tmp_content_language tcl
on conflict(id, indexer_configuration_id)
do update set lang = excluded.lang;
@@ -1674,7 +1674,7 @@
else
insert into content_language (id, lang, indexer_configuration_id)
select id, lang, indexer_configuration_id
- from tmp_content_language tcl
+ from tmp_content_language tcl
on conflict(id, indexer_configuration_id)
do nothing;
end if;
@@ -1995,8 +1995,8 @@
as $$
begin
if conflict_update then
- insert into content_metadata (id, translated_metadata, indexer_configuration_id)
- select id, translated_metadata, indexer_configuration_id
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
from tmp_content_metadata tcm
on conflict(id, indexer_configuration_id)
do update set translated_metadata = excluded.translated_metadata;
@@ -2056,6 +2056,117 @@
comment on function swh_content_metadata_get() is 'List content''s metadata';
-- end content_metadata functions
+-- revision_metadata functions
+--
+-- create the temporary table tmp_revision_metadata_missing (dropped on commit)
+create or replace function swh_mktemp_revision_metadata_missing()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_revision_metadata_missing (
+ id sha1_git,
+ indexer_configuration_id integer
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
+
+-- list the entries of tmp_revision_metadata_missing that have no row in
+-- revision_metadata with the same (id, indexer_configuration_id)
+-- NOTE(review): ids are sha1_git here but are returned cast to sha1 -- confirm
+-- operates in bulk: 0. swh_mktemp_revision_metadata_missing(), 1. COPY to tmp_revision_metadata_missing, 2. call this function
+create or replace function swh_revision_metadata_missing()
+ returns setof sha1
+ language plpgsql
+as $$
+begin
+ return query
+ select id::sha1 from tmp_revision_metadata_missing as tmp
+ where not exists
+ (select 1 from revision_metadata as c
+ where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+ return;
+end
+$$;
+
+comment on function swh_revision_metadata_missing() IS 'Filter missing revision metadata';
+
+-- merge the rows of tmp_revision_metadata into revision_metadata; rows that
+-- collide on (id, indexer_configuration_id) replace the stored
+-- translated_metadata when conflict_update is true and are skipped otherwise.
+--
+-- To filter duplicates beforehand, call swh_revision_metadata_missing
+-- before calling this function.
+--
+-- operates in bulk: 0. swh_mktemp_revision_metadata(), 1. COPY to
+-- tmp_revision_metadata, 2. call this function
+create or replace function swh_revision_metadata_add(conflict_update boolean)
+ returns void
+ language plpgsql
+as $$
+begin
+ if conflict_update then
+ insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+ select trm.id, trm.translated_metadata, trm.indexer_configuration_id
+ from tmp_revision_metadata as trm
+ on conflict(id, indexer_configuration_id)
+ do update set translated_metadata = excluded.translated_metadata;
+
+ else
+ insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+ select trm.id, trm.translated_metadata, trm.indexer_configuration_id
+ from tmp_revision_metadata as trm
+ on conflict(id, indexer_configuration_id)
+ do nothing;
+ end if;
+ return;
+end
+$$;
+
+comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata';
+
+-- create the temporary table tmp_revision_metadata (same columns and defaults as revision_metadata, dropped on commit) used to add rows
+create or replace function swh_mktemp_revision_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_revision_metadata (
+ like revision_metadata including defaults
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';
+
+-- row type returned by swh_revision_metadata_get(): a revision's metadata plus the columns of its indexer_configuration tool
+create type revision_metadata_signature as (
+ id sha1_git,
+ translated_metadata jsonb,
+ tool_id integer,
+ tool_name text,
+ tool_version text,
+ tool_configuration jsonb
+);
+
+-- Fetch the revision_metadata rows, joined with their tool, for the ids in tmp_bytea.
+--
+-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_revision_metadata_get()
+ returns setof revision_metadata_signature
+ language plpgsql
+as $$
+begin
+ return query
+ select rm.id, rm.translated_metadata, ic.id as tool_id, ic.tool_name, ic.tool_version, ic.tool_configuration
+ from tmp_bytea as tb
+ inner join revision_metadata as rm on rm.id = tb.id
+ inner join indexer_configuration as ic on ic.id = rm.indexer_configuration_id;
+ return;
+end
+$$;
+
+comment on function swh_revision_metadata_get() is 'List revision''s metadata';
+-- end revision_metadata functions
+
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -283,6 +283,13 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- origin_metadata_history and origin_metadata
--- TODO PK: origin_id, discovery_date
--- TODO FK: origin_id, indexer_configuration_id
+
+-- revision_metadata: primary key on (id, indexer_configuration_id), foreign keys to revision and indexer_configuration
+create unique index concurrently revision_metadata_pkey on revision_metadata(id, indexer_configuration_id);
+alter table revision_metadata add primary key using index revision_metadata_pkey;
+
+alter table revision_metadata add constraint revision_metadata_id_fkey foreign key (id) references revision(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_id_fkey;
+
+alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -477,3 +477,17 @@
comment on column content_metadata.id is 'sha1 of content file';
comment on column content_metadata.translated_metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+-- revision_metadata: minimal set of intrinsic metadata detected with the detection
+-- tool (indexer_configuration_id) and aggregated from the content_metadata translation.
+-- NOTE(review): indexer_configuration_id is bigint here, integer in tmp_revision_metadata_missing -- confirm
+create table revision_metadata(
+ id sha1_git not null,
+ translated_metadata jsonb not null,
+ indexer_configuration_id bigint not null
+);
+
+comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
+comment on column revision_metadata.id is 'sha1_git of revision';
+comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format';
+comment on column revision_metadata.indexer_configuration_id is 'tool used for detection';
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -243,5 +243,17 @@
def content_metadata_get(self, ids):
return self.post('content_metadata', {'ids': ids})
+ def revision_metadata_add(self, metadatas, conflict_update=False):
+ return self.post('revision_metadata/add', {
+ 'metadatas': metadatas,
+ 'conflict_update': conflict_update,
+ })
+
+ def revision_metadata_missing(self, metadatas):
+ return self.post('revision_metadata/missing', {'metadatas': metadatas})
+
+ def revision_metadata_get(self, ids):
+ return self.post('revision_metadata', {'ids': ids})
+
def indexer_configuration_get(self, tool):
return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -398,6 +398,24 @@
g.storage.content_metadata_get(**decode_request(request)))
+@app.route('/revision_metadata/add', methods=['POST'])
+def revision_metadata_add():
+ return encode_data(
+ g.storage.revision_metadata_add(**decode_request(request)))
+
+
+@app.route('/revision_metadata/missing', methods=['POST'])
+def revision_metadata_missing():
+ return encode_data(
+ g.storage.revision_metadata_missing(**decode_request(request)))
+
+
+@app.route('/revision_metadata', methods=['POST'])
+def revision_metadata_get():
+ return encode_data(
+ g.storage.revision_metadata_get(**decode_request(request)))
+
+
@app.route('/stat/counters', methods=['GET'])
def stat_counters():
return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -996,6 +996,35 @@
cur.execute(query)
yield from cursor_to_bytes(cur)
+ revision_metadata_cols = [
+ 'id', 'translated_metadata',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+ @stored_procedure('swh_mktemp_revision_metadata')
+ def mktemp_revision_metadata(self, cur=None): pass
+
+ @stored_procedure('swh_mktemp_revision_metadata_missing')
+ def mktemp_revision_metadata_missing(self, cur=None): pass
+
+ def revision_metadata_missing_from_temp(self, cur=None):
+ """List missing metadatas.
+
+ """
+ cur = self._cursor(cur)
+ cur.execute("SELECT * FROM swh_revision_metadata_missing()")
+ yield from cursor_to_bytes(cur)
+
+ def revision_metadata_add_from_temp(self, conflict_update, cur=None):
+ self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)",
+ (conflict_update, ))
+
+ def revision_metadata_get_from_temp(self, cur=None):
+ cur = self._cursor(cur)
+ query = "SELECT %s FROM swh_revision_metadata_get()" % (
+ ','.join(self.revision_metadata_cols))
+ cur.execute(query)
+ yield from cursor_to_bytes(cur)
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1608,6 +1608,57 @@
cur)
db.content_metadata_add_from_temp(conflict_update, cur)
+ @db_transaction_generator
+ def revision_metadata_missing(self, metadatas, cur=None):
+ """List metadatas missing from storage.
+
+ Args:
+ metadatas: iterable of dict with keys:
+ - id (bytes): sha1_git revision identifier
+ - indexer_configuration_id (int): id of the tool used to
+ compute the results
+
+ Returns:
+ an iterable of missing id
+
+ """
+ db = self.db
+ db.mktemp_revision_metadata_missing(cur)
+ db.copy_to(metadatas, 'tmp_revision_metadata_missing',
+ ['id', 'indexer_configuration_id'], cur)
+ for obj in db.revision_metadata_missing_from_temp(cur):
+ yield obj[0]
+
+ @db_transaction_generator
+ def revision_metadata_get(self, ids, cur=None):
+ """Yield the metadata (with tool) stored for the revisions in ids."""
+ self.db.store_tmp_bytea(ids, cur)
+ for c in self.db.revision_metadata_get_from_temp(cur):
+ yield converters.db_to_metadata(
+ dict(zip(self.db.revision_metadata_cols, c)))
+
+ @db_transaction
+ def revision_metadata_add(self, metadatas,
+ conflict_update=False, cur=None):
+ """Add metadatas not present in storage.
+
+ Args:
+ metadatas: iterable of dictionary with keys:
+ - id: sha1_git of revision
+ - translated_metadata: metadata to store (jsonb column)
+ - indexer_configuration_id: id of the tool used
+ conflict_update: overwrite duplicates if true, skip them otherwise (default)
+
+ """
+ db = self.db
+ db.mktemp_revision_metadata(cur)
+ # copy the rows to tmp_revision_metadata, then merge them into revision_metadata
+
+ db.copy_to(metadatas, 'tmp_revision_metadata',
+ ['id', 'translated_metadata', 'indexer_configuration_id'],
+ cur)
+ db.revision_metadata_add_from_temp(conflict_update, cur)
+
@db_transaction
def indexer_configuration_get(self, tool, cur=None):
db = self.db
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3129,6 +3129,247 @@
# metadata did change as the v2 was used to overwrite v1
self.assertEqual(actual_metadatas, expected_metadatas_v2)
+ @istest
+ def revision_metadata_missing(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+
+ rev = self.revision
+ missing_rev = self.revision2
+ self.storage.revision_add([rev])
+
+ metadatas = [
+ {
+ 'id': rev['id'],
+ 'indexer_configuration_id': tool_id,
+ },
+ {
+ 'id': missing_rev['id'],
+ 'indexer_configuration_id': tool_id,
+ }
+ ]
+
+ # when
+ actual_missing = list(self.storage.revision_metadata_missing(
+ metadatas))
+
+ # then
+ self.assertEqual(list(actual_missing), [
+ rev['id'],
+ missing_rev['id'],
+ ])
+
+ # given
+ self.storage.revision_metadata_add([{
+ 'id': rev['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id
+ }])
+
+ # when
+ actual_missing = list(self.storage.revision_metadata_missing(
+ metadatas))
+
+ # then
+ self.assertEqual(actual_missing, [missing_rev['id']])
+
+ @istest
+ def revision_metadata_get(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ rev = self.revision2
+ self.storage.revision_add([rev])
+
+ metadata_rev = {
+ 'id': rev['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id
+ }
+
+ # when
+ self.storage.revision_metadata_add([metadata_rev])
+
+ # then
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id'], self.revision['id']]))
+
+ expected_metadatas = [{
+ 'id': rev['id'],
+ 'translated_metadata': metadata_rev['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas)
+
+ @istest
+ def revision_metadata_add_drop_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ revision = self.revision
+ self.storage.revision_add([revision])
+
+ metadata_v1 = {
+ 'id': self.revision['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.revision_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision['id']]))
+
+ expected_metadatas_v1 = [{
+ 'id': self.revision['id'],
+ 'translated_metadata': metadata_v1['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'name': 'test_metadata',
+ 'author': 'MG',
+ },
+ })
+
+ self.storage.revision_metadata_add([metadata_v2])
+
+ # then
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision['id']]))
+
+ # metadata did not change as the v2 was dropped.
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ @istest
+ def revision_metadata_add_update_in_place_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ revision = self.revision2
+ self.storage.revision_add([revision])
+
+ metadata_v1 = {
+ 'id': self.revision2['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.revision_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id']]))
+
+ # then
+ expected_metadatas_v1 = [{
+ 'id': self.revision2['id'],
+ 'translated_metadata': metadata_v1['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'name': 'test_update_duplicated_metadata',
+ 'author': 'MG'
+ },
+ })
+ self.storage.revision_metadata_add([metadata_v2], conflict_update=True)
+
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id']]))
+
+ # the stored entry is now expected to carry the v2 metadata
+ expected_metadatas_v2 = [{
+ 'id': self.revision2['id'],
+ 'translated_metadata': metadata_v2['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ # metadata did change as the v2 was used to overwrite v1
+ self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
class TestLocalStorage(CommonTestStorage, unittest.TestCase):
"""Test the local storage"""
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 11:49 AM (18 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221708
Attached To
D235: Added revision_metadata table and methods into storage
Event Timeline
Log In to Comment