Page MenuHomeSoftware Heritage

D235.id774.diff
No OneTemporary

D235.id774.diff

diff --git a/sql/json/revision_metadata.translated_metadata.json b/sql/json/revision_metadata.translated_metadata.json
new file mode 100644
--- /dev/null
+++ b/sql/json/revision_metadata.translated_metadata.json
@@ -0,0 +1,59 @@
+{
+  "$schema": "http://json-schema.org/schema#",
+  "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json",
+
+  "type": "object",
+  "properties": {
+    "developmentStatus": {
+      "type": "array"
+    },
+    "version": {
+      "type": "array"
+    },
+    "operatingSystem": {
+      "type": "array"
+    },
+    "description": {
+      "type": "array"
+    },
+    "keywords": {
+      "type": "array"
+    },
+    "issueTracker": {
+      "type": "array"
+    },
+    "name": {
+      "type": "array"
+    },
+    "author": {
+      "type": "array"
+    },
+    "relatedLink": {
+      "type": "array"
+    },
+    "url": {
+      "type": "array"
+    },
+    "type": {
+      "type": "array"
+    },
+    "license": {
+      "type": "array"
+    },
+    "maintainer": {
+      "type": "array"
+    },
+    "email": {
+      "type": "array"
+    },
+    "softwareRequirements": {
+      "type": "array"
+    },
+    "identifier": {
+      "type": "array"
+    },
+    "codeRepository": {
+      "type": "array"
+    }
+  }
+}
diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -875,3 +875,6 @@
insert into indexer_configuration(tool_name, tool_version, tool_configuration)
values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+-- register the swh-metadata-detector tool (version 0.0.1); its configuration
+-- lists the "npm" and "codemeta" contexts
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1665,8 +1665,8 @@
as $$
begin
if conflict_update then
- insert into content_language (id, lang, indexer_configuration_id)
- select id, lang, indexer_configuration_id
+ insert into content_language (id, lang, indexer_configuration_id)
+ select id, lang, indexer_configuration_id
from tmp_content_language tcl
on conflict(id, indexer_configuration_id)
do update set lang = excluded.lang;
@@ -1674,7 +1674,7 @@
else
insert into content_language (id, lang, indexer_configuration_id)
select id, lang, indexer_configuration_id
- from tmp_content_language tcl
+ from tmp_content_language tcl
on conflict(id, indexer_configuration_id)
do nothing;
end if;
@@ -1995,8 +1995,8 @@
as $$
begin
if conflict_update then
- insert into content_metadata (id, translated_metadata, indexer_configuration_id)
- select id, translated_metadata, indexer_configuration_id
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
from tmp_content_metadata tcm
on conflict(id, indexer_configuration_id)
do update set translated_metadata = excluded.translated_metadata;
@@ -2056,6 +2056,117 @@
comment on function swh_content_metadata_get() is 'List content''s metadata';
-- end content_metadata functions
+-- revision_metadata functions
+--
+-- create the temporary table tmp_revision_metadata_missing, holding the
+-- (id, indexer_configuration_id) pairs to look up in revision_metadata;
+-- the table is dropped on commit
+create or replace function swh_mktemp_revision_metadata_missing()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_revision_metadata_missing (
+ id sha1_git,
+ indexer_configuration_id integer
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata';
+
+-- check which entries of tmp_revision_metadata_missing have no matching
+-- (id, indexer_configuration_id) row in revision_metadata
+--
+-- operates in bulk: 0. swh_mktemp_revision_metadata_missing(), 1. COPY to
+-- tmp_revision_metadata_missing, 2. call this function
+create or replace function swh_revision_metadata_missing()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        -- ids are sha1_git in the temporary table; they are cast to sha1 to
+        -- match the declared return type
+        select id::sha1 from tmp_revision_metadata_missing as tmp
+        where not exists
+            (select 1 from revision_metadata as c
+             where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_missing() IS 'Filter missing revision metadata';
+
+-- Bulk-insert the rows of tmp_revision_metadata into revision_metadata.
+-- On a conflict on (id, indexer_configuration_id), the stored
+-- translated_metadata is overwritten when conflict_update is true;
+-- otherwise the incoming row is skipped.
+--
+-- To filter duplicates beforehand, call swh_revision_metadata_missing
+-- before this function.
+--
+-- operates in bulk: 0. swh_mktemp_revision_metadata(), 1. COPY to
+-- tmp_revision_metadata, 2. call this function
+create or replace function swh_revision_metadata_add(conflict_update boolean)
+    returns void
+    language plpgsql
+as $$
+begin
+    if conflict_update then
+        -- overwrite the metadata of rows that already exist
+        insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+        select trm.id, trm.translated_metadata, trm.indexer_configuration_id
+        from tmp_revision_metadata as trm
+        on conflict (id, indexer_configuration_id)
+        do update set translated_metadata = excluded.translated_metadata;
+    else
+        -- keep existing rows untouched
+        insert into revision_metadata (id, translated_metadata, indexer_configuration_id)
+        select trm.id, trm.translated_metadata, trm.indexer_configuration_id
+        from tmp_revision_metadata as trm
+        on conflict (id, indexer_configuration_id)
+        do nothing;
+    end if;
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata';
+
+-- create the temporary table tmp_revision_metadata, shaped like
+-- revision_metadata (including column defaults), used to add revision
+-- metadata in bulk; the table is dropped on commit
+create or replace function swh_mktemp_revision_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_revision_metadata (
+ like revision_metadata including defaults
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata';
+
+-- row type returned by swh_revision_metadata_get(): a revision's translated
+-- metadata together with the description of the tool that produced it
+create type revision_metadata_signature as (
+ id sha1_git,
+ translated_metadata jsonb,
+ tool_id integer,
+ tool_name text,
+ tool_version text,
+ tool_configuration jsonb
+);
+
+-- List the revision metadata, with the associated tool description, for
+-- the ids stored in tmp_bytea.
+--
+-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_revision_metadata_get()
+    returns setof revision_metadata_signature
+    language plpgsql
+as $$
+begin
+    return query
+        select rm.id,
+               rm.translated_metadata,
+               ic.id as tool_id,
+               ic.tool_name,
+               ic.tool_version,
+               ic.tool_configuration
+        from tmp_bytea as tb
+        inner join revision_metadata as rm on rm.id = tb.id
+        inner join indexer_configuration as ic on ic.id = rm.indexer_configuration_id;
+    return;
+end
+$$;
+
+comment on function swh_revision_metadata_get() is 'List revision''s metadata';
+-- end revision_metadata functions
+
-- simple counter mapping a textual label to an integer value
create type counter as (
label text,
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -283,6 +283,13 @@
alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
--- origin_metadata_history and origin_metadata
--- TODO PK: origin_id, discovery_date
--- TODO FK: origin_id, indexer_configuration_id
+
+-- revision_metadata
+-- primary key on (id, indexer_configuration_id): the unique index is created
+-- first, then attached to the table as its primary key
+create unique index concurrently revision_metadata_pkey on revision_metadata(id, indexer_configuration_id);
+alter table revision_metadata add primary key using index revision_metadata_pkey;
+
+-- foreign keys are added as "not valid" and validated in a separate statement
+alter table revision_metadata add constraint revision_metadata_id_fkey foreign key (id) references revision(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_id_fkey;
+
+alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey;
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -477,3 +477,17 @@
comment on column content_metadata.id is 'sha1 of content file';
comment on column content_metadata.translated_metadata is 'result of translation with defined format';
comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+-- The table revision_metadata provides a minimal set of intrinsic metadata
+-- detected with the detection tool (indexer_configuration_id) and aggregated
+-- from the content_metadata translation.
+-- Its primary key (id, indexer_configuration_id) and its foreign keys are
+-- declared separately, in swh-indexes.sql.
+create table revision_metadata(
+ id sha1_git not null,
+ translated_metadata jsonb not null,
+ indexer_configuration_id bigint not null
+);
+
+comment on table revision_metadata is 'metadata semantically detected and translated in a revision';
+comment on column revision_metadata.id is 'sha1_git of revision';
+comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format';
+comment on column revision_metadata.indexer_configuration_id is 'tool used for detection';
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -243,5 +243,17 @@
def content_metadata_get(self, ids):
return self.post('content_metadata', {'ids': ids})
+    def revision_metadata_add(self, metadatas, conflict_update=False):
+        """POST metadatas and the conflict_update flag to the
+        'revision_metadata/add' endpoint and return the response."""
+        payload = {
+            'metadatas': metadatas,
+            'conflict_update': conflict_update,
+        }
+        return self.post('revision_metadata/add', payload)
+
+    def revision_metadata_missing(self, metadatas):
+        """POST metadatas to the 'revision_metadata/missing' endpoint and
+        return the response."""
+        payload = {'metadatas': metadatas}
+        return self.post('revision_metadata/missing', payload)
+
+    def revision_metadata_get(self, ids):
+        """POST ids to the 'revision_metadata' endpoint and return the
+        response."""
+        payload = {'ids': ids}
+        return self.post('revision_metadata', payload)
+
def indexer_configuration_get(self, tool):
return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -398,6 +398,24 @@
g.storage.content_metadata_get(**decode_request(request)))
+@app.route('/revision_metadata/add', methods=['POST'])
+def revision_metadata_add():
+    """Call storage.revision_metadata_add with the decoded request as
+    keyword arguments and return the encoded result."""
+    storage_call = g.storage.revision_metadata_add
+    result = storage_call(**decode_request(request))
+    return encode_data(result)
+
+
+@app.route('/revision_metadata/missing', methods=['POST'])
+def revision_metadata_missing():
+    """Call storage.revision_metadata_missing with the decoded request as
+    keyword arguments and return the encoded result."""
+    storage_call = g.storage.revision_metadata_missing
+    result = storage_call(**decode_request(request))
+    return encode_data(result)
+
+
+@app.route('/revision_metadata', methods=['POST'])
+def revision_metadata_get():
+    """Call storage.revision_metadata_get with the decoded request as
+    keyword arguments and return the encoded result."""
+    storage_call = g.storage.revision_metadata_get
+    result = storage_call(**decode_request(request))
+    return encode_data(result)
+
+
@app.route('/stat/counters', methods=['GET'])
def stat_counters():
return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -996,6 +996,35 @@
cur.execute(query)
yield from cursor_to_bytes(cur)
+ # columns selected from swh_revision_metadata_get(), in this order
+ revision_metadata_cols = [
+ 'id', 'translated_metadata',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+ # The two methods below have empty bodies; the stored_procedure decorator
+ # is given the name of the SQL function -- presumably it performs the
+ # call (TODO confirm against the decorator's definition).
+ @stored_procedure('swh_mktemp_revision_metadata')
+ def mktemp_revision_metadata(self, cur=None): pass
+
+ @stored_procedure('swh_mktemp_revision_metadata_missing')
+ def mktemp_revision_metadata_missing(self, cur=None): pass
+
+    def revision_metadata_missing_from_temp(self, cur=None):
+        """Yield the rows returned by swh_revision_metadata_missing().
+
+        """
+        cursor = self._cursor(cur)
+        cursor.execute("SELECT * FROM swh_revision_metadata_missing()")
+        yield from cursor_to_bytes(cursor)
+
+    def revision_metadata_add_from_temp(self, conflict_update, cur=None):
+        """Run swh_revision_metadata_add with the conflict_update flag.
+
+        """
+        cursor = self._cursor(cur)
+        params = (conflict_update, )
+        cursor.execute("SELECT swh_revision_metadata_add(%s)", params)
+
+    def revision_metadata_get_from_temp(self, cur=None):
+        """Yield the rows of swh_revision_metadata_get(), selecting the
+        columns listed in revision_metadata_cols.
+
+        """
+        cursor = self._cursor(cur)
+        columns = ','.join(self.revision_metadata_cols)
+        cursor.execute(
+            "SELECT %s FROM swh_revision_metadata_get()" % columns)
+        yield from cursor_to_bytes(cursor)
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1608,6 +1608,57 @@
cur)
db.content_metadata_add_from_temp(conflict_update, cur)
+ @db_transaction_generator
+ def revision_metadata_missing(self, metadatas, cur=None):
+ """List metadatas missing from storage.
+
+ Args:
+ metadatas: iterable of dict with keys:
+ - id (bytes): sha1_git revision identifier
+ - indexer_configuration_id (int): tool used to compute the
+ results
+
+ Yields:
+ the id of each entry that has no matching row in storage
+
+ """
+ db = self.db
+ db.mktemp_revision_metadata_missing(cur)
+ db.copy_to(metadatas, 'tmp_revision_metadata_missing',
+ ['id', 'indexer_configuration_id'], cur)
+ for obj in db.revision_metadata_missing_from_temp(cur):
+ yield obj[0]
+
+    @db_transaction_generator
+    def revision_metadata_get(self, ids, cur=None):
+        """Retrieve the metadata stored for the given revisions.
+
+        Args:
+            ids: iterable of sha1_git revision identifiers
+
+        Yields:
+            one result per stored metadata row, built by
+            converters.db_to_metadata from a dict keyed by
+            db.revision_metadata_cols
+
+        """
+        db = self.db
+        db.store_tmp_bytea(ids, cur)
+        # read back through the cursor that loaded tmp_bytea, as the
+        # sibling revision_metadata_missing/add methods do
+        for c in db.revision_metadata_get_from_temp(cur):
+            yield converters.db_to_metadata(
+                dict(zip(db.revision_metadata_cols, c)))
+
+ @db_transaction
+ def revision_metadata_add(self, metadatas,
+ conflict_update=False, cur=None):
+ """Add metadatas not present in storage.
+
+ Args:
+ metadatas: iterable of dictionary with keys:
+ - id: sha1_git of revision
+ - translated_metadata: the translated metadata (stored in a
+ jsonb column)
+ - indexer_configuration_id: tool used to compute the results
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+ db = self.db
+ db.mktemp_revision_metadata(cur)
+ # NOTE(review): rows are copied as provided; no mapping of empty
+ # metadata takes place in this method
+
+ db.copy_to(metadatas, 'tmp_revision_metadata',
+ ['id', 'translated_metadata', 'indexer_configuration_id'],
+ cur)
+ db.revision_metadata_add_from_temp(conflict_update, cur)
+
@db_transaction
def indexer_configuration_get(self, tool, cur=None):
db = self.db
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -3129,6 +3129,247 @@
# metadata did change as the v2 was used to overwrite v1
self.assertEqual(actual_metadatas, expected_metadatas_v2)
+ @istest
+ def revision_metadata_missing(self):
+ """Both ids are missing at first; only the one without metadata
+ remains missing after metadata is added for the other."""
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+
+ rev = self.revision
+ missing_rev = self.revision2
+ self.storage.revision_add([rev])
+
+ metadatas = [
+ {
+ 'id': rev['id'],
+ 'indexer_configuration_id': tool_id,
+ },
+ {
+ 'id': missing_rev['id'],
+ 'indexer_configuration_id': tool_id,
+ }
+ ]
+
+ # when
+ actual_missing = list(self.storage.revision_metadata_missing(
+ metadatas))
+
+ # then
+ self.assertEqual(list(actual_missing), [
+ rev['id'],
+ missing_rev['id'],
+ ])
+
+ # given
+ self.storage.revision_metadata_add([{
+ 'id': rev['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id
+ }])
+
+ # when
+ actual_missing = list(self.storage.revision_metadata_missing(
+ metadatas))
+
+ # then
+ self.assertEqual(actual_missing, [missing_rev['id']])
+
+ @istest
+ def revision_metadata_get(self):
+ """Only the revision that had metadata added is returned, along
+ with the description of the tool."""
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ rev = self.revision2
+ self.storage.revision_add([rev])
+
+ metadata_rev = {
+ 'id': rev['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id
+ }
+
+ # when
+ self.storage.revision_metadata_add([metadata_rev])
+
+ # then
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id'], self.revision['id']]))
+
+ expected_metadatas = [{
+ 'id': rev['id'],
+ 'translated_metadata': metadata_rev['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas)
+
+ @istest
+ def revision_metadata_add_drop_duplicate(self):
+ """Adding a second version without conflict_update leaves the
+ first version in place."""
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ revision = self.revision
+ self.storage.revision_add([revision])
+
+ metadata_v1 = {
+ 'id': self.revision['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.revision_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision['id']]))
+
+ expected_metadatas_v1 = [{
+ 'id': self.revision['id'],
+ 'translated_metadata': metadata_v1['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'name': 'test_metadata',
+ 'author': 'MG',
+ },
+ })
+
+ self.storage.revision_metadata_add([metadata_v2])
+
+ # then
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision['id']]))
+
+ # metadata did not change as the v2 was dropped.
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ @istest
+ def revision_metadata_add_update_in_place_duplicate(self):
+ """Adding a second version with conflict_update=True overwrites the
+ first version."""
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-detector']['id']
+ revision = self.revision2
+ self.storage.revision_add([revision])
+
+ metadata_v1 = {
+ 'id': self.revision2['id'],
+ 'translated_metadata': {
+ 'developmentStatus': None,
+ 'version': None,
+ 'operatingSystem': None,
+ 'description': None,
+ 'keywords': None,
+ 'issueTracker': None,
+ 'name': None,
+ 'author': None,
+ 'relatedLink': None,
+ 'url': None,
+ 'type': None,
+ 'license': None,
+ 'maintainer': None,
+ 'email': None,
+ 'softwareRequirements': None,
+ 'identifier': None
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.revision_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id']]))
+
+ # then
+ expected_metadatas_v1 = [{
+ 'id': self.revision2['id'],
+ 'translated_metadata': metadata_v1['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'name': 'test_update_duplicated_metadata',
+ 'author': 'MG'
+ },
+ })
+ self.storage.revision_metadata_add([metadata_v2], conflict_update=True)
+
+ actual_metadatas = list(self.storage.revision_metadata_get(
+ [self.revision2['id']]))
+
+ # the expected result now carries the v2 translated_metadata
+ expected_metadatas_v2 = [{
+ 'id': self.revision2['id'],
+ 'translated_metadata': metadata_v2['translated_metadata'],
+ 'tool': tools['swh-metadata-detector']
+ }]
+
+ # metadata did change as the v2 was used to overwrite v1
+ self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
class TestLocalStorage(CommonTestStorage, unittest.TestCase):
"""Test the local storage"""

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 11:49 AM (18 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221708

Event Timeline