Page MenuHomeSoftware Heritage

D219.diff
No OneTemporary

D219.diff

diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -872,3 +872,9 @@
insert into indexer_configuration(tool_name, tool_version, tool_configuration)
values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}');
+
+-- register the metadata translation tool for the npm context
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+-- NOTE(review): the maven context is deliberately left disabled below;
+-- presumably pending translator support -- confirm before enabling.
+-- insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+-- values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "maven"}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1945,6 +1945,116 @@
comment on function swh_content_fossology_license_get() IS 'List content licenses';
+-- content_metadata functions
+--
+-- create the temporary table tmp_content_metadata_missing, holding the
+-- (id, indexer_configuration_id) pairs to look up in content_metadata;
+-- the table is dropped on commit
+create or replace function swh_mktemp_content_metadata_missing()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_content_metadata_missing (
+ id sha1,
+ indexer_configuration_id integer
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
+
+-- check which entries of tmp_content_metadata_missing are missing from
+-- content_metadata
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata_missing(), 1. COPY to
+-- tmp_content_metadata_missing, 2. call this function
+create or replace function swh_content_metadata_missing()
+ returns setof sha1
+ language plpgsql
+as $$
+begin
+ return query
+ -- an entry is missing when no content_metadata row exists for the
+ -- same (id, indexer_configuration_id) pair
+ select id::sha1 from tmp_content_metadata_missing as tmp
+ where not exists
+ (select 1 from content_metadata as c
+ where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';
+
+-- add tmp_content_metadata entries to content_metadata, overwriting
+-- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- A duplicate is a row with the same (id, indexer_configuration_id).
+--
+-- If filtering duplicates is in order, the call to
+-- swh_content_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata(), 1. COPY to
+-- tmp_content_metadata, 2. call this function
+create or replace function swh_content_metadata_add(conflict_update boolean)
+ returns void
+ language plpgsql
+as $$
+begin
+ if conflict_update then
+ -- overwrite the stored translation with the incoming one
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set translated_metadata = excluded.translated_metadata;
+
+ else
+ -- keep the stored translation, drop the incoming duplicate
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do nothing;
+ end if;
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+
+-- create the temporary table tmp_content_metadata, used to add entries
+-- to content_metadata; it has the same columns and defaults as
+-- content_metadata and is dropped on commit
+create or replace function swh_mktemp_content_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_content_metadata (
+ like content_metadata including defaults
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';
+
+-- row type returned by swh_content_metadata_get(): the translated
+-- metadata of a content, together with the tool
+-- (indexer_configuration entry) associated with it
+create type content_metadata_signature as (
+ id sha1,
+ translated_metadata jsonb,
+ tool_id integer,
+ tool_name text,
+ tool_version text,
+ tool_configuration jsonb
+);
+
+-- Retrieve the content metadata for the ids listed in the temporary
+-- table tmp_bytea.
+--
+-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_content_metadata_get()
+ returns setof content_metadata_signature
+ language plpgsql
+as $$
+begin
+ return query
+ -- inner joins: ids without any content_metadata row produce no output
+ select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
+ from tmp_bytea t
+ inner join content_metadata c on c.id = t.id
+ inner join indexer_configuration i on i.id=c.indexer_configuration_id;
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_get() is 'List content''s metadata';
+-- end content_metadata functions
-- simple counter mapping a textual label to an integer value
create type counter as (
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -272,3 +272,17 @@
alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey;
+
+-- content_metadata
+-- primary key on (id, indexer_configuration_id): the unique index is built
+-- concurrently first, then promoted to primary key
+create unique index concurrently content_metadata_pkey on content_metadata(id, indexer_configuration_id);
+alter table content_metadata add primary key using index content_metadata_pkey;
+
+-- foreign keys are added as "not valid" and validated in a second statement
+alter table content_metadata add constraint content_metadata_id_fkey foreign key (id) references content(sha1) not valid;
+alter table content_metadata validate constraint content_metadata_id_fkey;
+
+alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
+
+-- origin_metadata_history and origin_metadata
+-- TODO PK: origin_id, discovery_date
+-- TODO FK: origin_id, indexer_configuration_id
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -463,3 +463,54 @@
comment on column content_fossology_license.id is 'Raw content identifier';
comment on column content_fossology_license.license_id is 'One of the content''s license identifier';
comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information';
+
+
+-- The table content_metadata stores the translation of files identified
+-- as potentially containing metadata, along with the translation tool
+-- used (indexer_configuration_id)
+create table content_metadata(
+ id sha1 not null,
+ translated_metadata jsonb not null,
+ indexer_configuration_id bigint not null
+);
+
+comment on table content_metadata is 'metadata semantically translated from a content file';
+comment on column content_metadata.id is 'sha1 of content file';
+comment on column content_metadata.translated_metadata is 'result of translation with defined format';
+comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+
+-- Metadata discovered during a listing or a deposit of an origin.
+-- Each row also holds a translation of that metadata to a defined json
+-- schema, produced by a translation tool (indexer_configuration_id);
+-- the translation columns are nullable.
+create table origin_metadata_history(
+ origin_id bigint not null,
+ discovery_date timestamptz not null,
+ translation_date timestamptz,
+ provenance_type text not null, -- TODO use an enum (?)
+ raw_metadata jsonb not null,
+ translated_metadata jsonb,
+ indexer_configuration_id bigint,
+ object_id bigserial -- short object identifier
+);
+
+
+-- NOTE(review): the table comment below says "latest metadata" although
+-- this is the history table; the current value is described as living in
+-- origin_metadata -- confirm the intended wording.
+comment on table origin_metadata_history is 'keeps latest metadata concerning an origin';
+comment on column origin_metadata_history.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata_history.discovery_date is 'the date of retrieval';
+comment on column origin_metadata_history.translation_date is 'the date of translation';
+comment on column origin_metadata_history.provenance_type is 'lister, publisher, etc';
+comment on column origin_metadata_history.raw_metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata_history.translated_metadata is 'metadata in defined terms in json schema';
+comment on column origin_metadata_history.indexer_configuration_id is 'tool used for translation';
+
+
+-- Materialized view of origin_metadata_history, storing the *current* value of
+-- metadata, as last seen by SWH.
+-- Declared as a plain table (not a PostgreSQL materialized view); its
+-- columns mirror origin_metadata_history, minus object_id.
+create table origin_metadata(
+ origin_id bigint not null,
+ discovery_date timestamptz not null,
+ translation_date timestamptz,
+ provenance_type text not null, -- TODO use an enum (?)
+ raw_metadata jsonb not null,
+ translated_metadata jsonb,
+ indexer_configuration_id bigint
+);
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -231,5 +231,17 @@
def content_fossology_license_get(self, ids):
return self.post('content/fossology_license', {'ids': ids})
+ def content_metadata_add(self, metadatas, conflict_update=False):
+ """POST metadatas and conflict_update to 'content_metadata/add'."""
+ return self.post('content_metadata/add', {
+ 'metadatas': metadatas,
+ 'conflict_update': conflict_update,
+ })
+
+ def content_metadata_missing(self, metadatas):
+ """POST metadatas to 'content_metadata/missing', return the reply."""
+ return self.post('content_metadata/missing', {'metadatas': metadatas})
+
+ def content_metadata_get(self, ids):
+ """POST ids to the 'content_metadata' endpoint, return the reply."""
+ return self.post('content_metadata', {'ids': ids})
+
def indexer_configuration_get(self, tool):
return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -380,6 +380,24 @@
**decode_request(request)))
+@app.route('/content_metadata/add', methods=['POST'])
+def content_metadata_add():
+ """Decode the request and forward it to storage.content_metadata_add."""
+ return encode_data(
+ g.storage.content_metadata_add(**decode_request(request)))
+
+
+@app.route('/content_metadata/missing', methods=['POST'])
+def content_metadata_missing():
+ """Decode the request and forward it to storage.content_metadata_missing."""
+ return encode_data(
+ g.storage.content_metadata_missing(**decode_request(request)))
+
+
+@app.route('/content_metadata', methods=['POST'])
+def content_metadata_get():
+ """Decode the request and forward it to storage.content_metadata_get."""
+ return encode_data(
+ g.storage.content_metadata_get(**decode_request(request)))
+
+
@app.route('/stat/counters', methods=['GET'])
def stat_counters():
return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -416,6 +416,22 @@
}
+def db_to_metadata(metadata):
+ """Convert a flat content metadata row into its nested output form.
+
+ Args:
+ metadata (dict): row with keys id, translated_metadata, tool_id,
+ tool_name, tool_version and tool_configuration
+
+ Returns:
+ dict with keys id, translated_metadata and tool; tool is a dict
+ with keys id, name, version and configuration
+
+ """
+ return {
+ 'id': metadata['id'],
+ 'translated_metadata': metadata['translated_metadata'],
+ 'tool': {
+ 'id': metadata['tool_id'],
+ 'name': metadata['tool_name'],
+ 'version': metadata['tool_version'],
+ 'configuration': metadata['tool_configuration']
+ }
+ }
+
+
def db_to_fossology_license(license):
return {
'id': license['id'],
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -967,6 +967,35 @@
cur.execute(query)
yield from cursor_to_bytes(cur)
+ # columns selected from swh_content_metadata_get(), in this order
+ content_metadata_cols = [
+ 'id', 'translated_metadata',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+ # bound to the stored procedure swh_mktemp_content_metadata
+ @stored_procedure('swh_mktemp_content_metadata')
+ def mktemp_content_metadata(self, cur=None): pass
+
+ # bound to the stored procedure swh_mktemp_content_metadata_missing
+ @stored_procedure('swh_mktemp_content_metadata_missing')
+ def mktemp_content_metadata_missing(self, cur=None): pass
+
+ def content_metadata_missing_from_temp(self, cur=None):
+ """List missing metadatas.
+
+ Yields the rows returned by swh_content_metadata_missing(), passed
+ through cursor_to_bytes.
+
+ """
+ cur = self._cursor(cur)
+ cur.execute("SELECT * FROM swh_content_metadata_missing()")
+ yield from cursor_to_bytes(cur)
+
+ def content_metadata_add_from_temp(self, conflict_update, cur=None):
+ """Call swh_content_metadata_add with the conflict_update flag."""
+ self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
+ (conflict_update, ))
+
+ def content_metadata_get_from_temp(self, cur=None):
+ """Yield the rows of swh_content_metadata_get().
+
+ Columns are selected in the order of content_metadata_cols.
+
+ """
+ cur = self._cursor(cur)
+ # the column list comes from a class-level constant, not caller input
+ query = "SELECT %s FROM swh_content_metadata_get()" % (
+ ','.join(self.content_metadata_cols))
+ cur.execute(query)
+ yield from cursor_to_bytes(cur)
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1558,6 +1558,61 @@
cur=cur)
db.content_fossology_license_add_from_temp(conflict_update, cur)
+ @db_transaction_generator
+ def content_metadata_missing(self, metadatas, cur=None):
+ """List metadatas missing from storage.
+
+ Args:
+ metadatas: iterable of dict with keys:
+ - id (bytes): sha1 identifier
+ - indexer_configuration_id (int): identifier of the tool
+ used to compute the results
+
+ Yields:
+ the sha1 identifiers of the entries missing from storage
+
+ """
+ db = self.db
+ db.mktemp_content_metadata_missing(cur)
+ db.copy_to(metadatas, 'tmp_content_metadata_missing',
+ ['id', 'indexer_configuration_id'], cur)
+ for obj in db.content_metadata_missing_from_temp(cur):
+ yield obj[0]
+
+ @db_transaction_generator
+ def content_metadata_get(self, ids, cur=None):
+ """Retrieve the metadata stored for the given content ids.
+
+ Args:
+ ids: iterable of sha1 identifiers
+
+ Yields:
+ dict with keys:
+ - id: sha1 identifier of the content
+ - translated_metadata: the translated metadata
+ - tool (dict): tool used to compute the metadata, with keys
+ id, name, version and configuration
+
+ """
+ db = self.db
+ db.store_tmp_bytea(ids, cur)
+ # read on the same cursor as the other calls of this method
+ for c in db.content_metadata_get_from_temp(cur):
+ yield converters.db_to_metadata(
+ dict(zip(db.content_metadata_cols, c)))
+
+ @db_transaction
+ def content_metadata_add(self, metadatas, conflict_update=False, cur=None):
+ """Add metadatas not present in storage.
+
+ Args:
+ metadatas: iterable of dictionary with keys:
+ - id: sha1
+ - translated_metadata: the translated metadata, stored in a
+ jsonb column
+ - indexer_configuration_id: identifier of the tool used to
+ compute the results
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+ db = self.db
+ db.mktemp_content_metadata(cur)
+ # keep only the three columns copied to tmp_content_metadata
+ db.copy_to(
+ ({
+ 'id': m['id'],
+ 'translated_metadata': m['translated_metadata'],
+ 'indexer_configuration_id': m['indexer_configuration_id'],
+ } for m in metadatas),
+ 'tmp_content_metadata',
+ ['id', 'translated_metadata', 'indexer_configuration_id'], cur)
+
+ db.content_metadata_add_from_temp(conflict_update, cur)
+
@db_transaction
def indexer_configuration_get(self, tool, cur=None):
db = self.db
diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py
--- a/swh/storage/tests/test_converters.py
+++ b/swh/storage/tests/test_converters.py
@@ -284,3 +284,29 @@
actual_license = converters.db_to_fossology_license(input_license)
self.assertEquals(actual_license, expected_license)
+
+ @istest
+ def db_to_metadata(self):
+ """The flat tool_* keys are regrouped under a nested 'tool' dict."""
+ input_metadata = {
+ 'id': b'some-id',
+ 'tool_id': 20,
+ 'tool_name': 'some-toolname',
+ 'tool_version': 'some-toolversion',
+ 'tool_configuration': {},
+ 'translated_metadata': b'translated_metadata',
+ }
+
+ expected_metadata = {
+ 'id': b'some-id',
+ 'translated_metadata': b'translated_metadata',
+ 'tool': {
+ 'id': 20,
+ 'name': 'some-toolname',
+ 'version': 'some-toolversion',
+ 'configuration': {},
+ }
+ }
+
+ actual_metadata = converters.db_to_metadata(input_metadata)
+
+ self.assertEquals(actual_metadata, expected_metadata)
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2947,6 +2947,225 @@
})
self.assertEqual(actual_licenses, [expected_license])
+ @istest
+ def content_metadata_missing(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadatas = [
+ {
+ 'id': self.cont2['sha1'],
+ 'indexer_configuration_id': tool_id,
+ },
+ {
+ 'id': self.missing_cont['sha1'],
+ 'indexer_configuration_id': tool_id,
+ }
+ ]
+
+ # when
+ actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+ # then
+ self.assertEqual(list(actual_missing), [
+ self.cont2['sha1'],
+ self.missing_cont['sha1'],
+ ])
+
+ # given
+ self.storage.content_metadata_add([{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id
+ }])
+
+ # when
+ actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+ # then
+ self.assertEqual(actual_missing, [self.missing_cont['sha1']])
+
+ @istest
+ def content_metadata_get(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # when
+ self.storage.content_metadata_add([metadata1])
+
+ # then
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1'], self.missing_cont['sha1']]))
+
+ expected_metadatas = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas)
+
+ @istest
+ def content_metadata_add_drop_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata_v1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.content_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ expected_metadatas_v1 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_drop_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ })
+
+ self.storage.content_metadata_add([metadata_v2])
+
+ # then
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # metadata did not change as the v2 was dropped.
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ @istest
+ def content_metadata_add_update_in_place_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata_v1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.content_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # then
+ expected_metadatas_v1 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_update_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ })
+ self.storage.content_metadata_add([metadata_v2], conflict_update=True)
+
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # expected state after the overwrite: v2 replaces v1
+ expected_metadatas_v2 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_update_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ # metadata did change as the v2 was used to overwrite v1
+ self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
class TestLocalStorage(CommonTestStorage, unittest.TestCase):
"""Test the local storage"""
@@ -3040,6 +3259,33 @@
self.assertEqual(expected_tool, actual_tool)
+ @istest
+ def indexer_configuration_metadata_get_missing_context(self):
+ """Looking up the tool with an unregistered context yields None."""
+ tool = {
+ 'tool_name': 'swh-metadata-translator',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {"context": "unknown-context"},
+ }
+
+ actual_tool = self.storage.indexer_configuration_get(tool)
+
+ self.assertIsNone(actual_tool)
+
+ @istest
+ def indexer_configuration_metadata_get(self):
+ """Looking up the registered npm translator returns the tool."""
+ tool = {
+ 'tool_name': 'swh-metadata-translator',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {"type": "local", "context": "npm"},
+ }
+
+ actual_tool = self.storage.indexer_configuration_get(tool)
+
+ # the id is assigned by the database, so take it from the result
+ expected_tool = tool.copy()
+ expected_tool['id'] = actual_tool['id']
+
+ self.assertEqual(expected_tool, actual_tool)
+
class AlteringSchemaTest(BaseTestStorage, unittest.TestCase):
"""This class is dedicated for the rare case where the schema needs to

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 4:33 PM (11 w, 16 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220384

Event Timeline