Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066633
D219.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
24 KB
Subscribers
None
D219.diff
View Options
diff --git a/sql/swh-data.sql b/sql/swh-data.sql
--- a/sql/swh-data.sql
+++ b/sql/swh-data.sql
@@ -872,3 +872,9 @@
insert into indexer_configuration(tool_name, tool_version, tool_configuration)
values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}');
+
+-- Register the metadata translator tool for the "npm" context.
+insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}');
+
+-- NOTE(review): the "maven" context entry below is commented out, so it is
+-- not inserted; confirm whether it should be enabled or removed.
+-- insert into indexer_configuration(tool_name, tool_version, tool_configuration)
+-- values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "maven"}');
diff --git a/sql/swh-func.sql b/sql/swh-func.sql
--- a/sql/swh-func.sql
+++ b/sql/swh-func.sql
@@ -1945,6 +1945,116 @@
comment on function swh_content_fossology_license_get() IS 'List content licenses';
+-- content_metadata functions
+--
+-- create the temporary table tmp_content_metadata_missing, holding the
+-- (id, indexer_configuration_id) pairs to look up in content_metadata;
+-- the table is dropped when the transaction commits
+create or replace function swh_mktemp_content_metadata_missing()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_content_metadata_missing (
+ id sha1,
+ indexer_configuration_id integer
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata';
+
+-- check which entries of tmp_content_metadata_missing are missing from
+-- content_metadata
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata_missing(), 1. COPY to
+-- tmp_content_metadata_missing, 2. call this function
+create or replace function swh_content_metadata_missing()
+ returns setof sha1
+ language plpgsql
+as $$
+begin
+ -- an entry is missing when content_metadata has no row with the same
+ -- (id, indexer_configuration_id) pair
+ return query
+ select id::sha1 from tmp_content_metadata_missing as tmp
+ where not exists
+ (select 1 from content_metadata as c
+ where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id);
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_missing() IS 'Filter missing content metadata';
+
+-- add tmp_content_metadata entries to content_metadata, overwriting
+-- duplicates if conflict_update is true, skipping duplicates otherwise.
+-- A duplicate is an entry with the same (id, indexer_configuration_id).
+--
+-- If filtering duplicates is in order, the call to
+-- swh_content_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_content_metadata(), 1. COPY to
+-- tmp_content_metadata, 2. call this function
+create or replace function swh_content_metadata_add(conflict_update boolean)
+ returns void
+ language plpgsql
+as $$
+begin
+ if conflict_update then
+ -- overwrite the translated_metadata of already existing entries
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do update set translated_metadata = excluded.translated_metadata;
+
+ else
+ -- keep already existing entries untouched
+ insert into content_metadata (id, translated_metadata, indexer_configuration_id)
+ select id, translated_metadata, indexer_configuration_id
+ from tmp_content_metadata tcm
+ on conflict(id, indexer_configuration_id)
+ do nothing;
+ end if;
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata';
+
+-- create the temporary table tmp_content_metadata, with the same columns
+-- (including defaults) as content_metadata, used to add content_metadata
+-- entries; the table is dropped when the transaction commits
+create or replace function swh_mktemp_content_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table tmp_content_metadata (
+ like content_metadata including defaults
+ ) on commit drop;
+$$;
+
+comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata';
+
+-- row type returned by swh_content_metadata_get(): a content_metadata entry
+-- (id, translated_metadata) together with the fields of the
+-- indexer_configuration row (the tool) it is joined with
+create type content_metadata_signature as (
+ id sha1,
+ translated_metadata jsonb,
+ tool_id integer,
+ tool_name text,
+ tool_version text,
+ tool_configuration jsonb
+);
+
+-- Retrieve the content metadata, and the description of the tool that
+-- produced them, for the ids listed in the temporary table tmp_bytea.
+-- Ids without any content_metadata entry produce no row (inner joins).
+--
+-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function
+create or replace function swh_content_metadata_get()
+ returns setof content_metadata_signature
+ language plpgsql
+as $$
+begin
+ return query
+ select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration
+ from tmp_bytea t
+ inner join content_metadata c on c.id = t.id
+ inner join indexer_configuration i on i.id=c.indexer_configuration_id;
+ return;
+end
+$$;
+
+comment on function swh_content_metadata_get() is 'List content''s metadata';
+-- end content_metadata functions
-- simple counter mapping a textual label to an integer value
create type counter as (
diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql
--- a/sql/swh-indexes.sql
+++ b/sql/swh-indexes.sql
@@ -272,3 +272,17 @@
alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey;
+
+-- content_metadata
+-- primary key on (id, indexer_configuration_id): the unique index is built
+-- concurrently first, then promoted to primary key
+create unique index concurrently content_metadata_pkey on content_metadata(id, indexer_configuration_id);
+alter table content_metadata add primary key using index content_metadata_pkey;
+
+-- foreign keys are added as "not valid" and validated in a separate statement
+alter table content_metadata add constraint content_metadata_id_fkey foreign key (id) references content(sha1) not valid;
+alter table content_metadata validate constraint content_metadata_id_fkey;
+
+alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey;
+
+-- origin_metadata_history and origin_metadata
+-- TODO PK: origin_id, discovery_date
+-- TODO FK: origin_id, indexer_configuration_id
diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql
--- a/sql/swh-schema.sql
+++ b/sql/swh-schema.sql
@@ -463,3 +463,54 @@
comment on column content_fossology_license.id is 'Raw content identifier';
comment on column content_fossology_license.license_id is 'One of the content''s license identifier';
comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information';
+
+
+-- The table content_metadata stores the translation of files identified as
+-- potentially containing metadata, as computed by a translation tool
+-- (indexer_configuration_id)
+create table content_metadata(
+ id sha1 not null,
+ translated_metadata jsonb not null,
+ indexer_configuration_id bigint not null
+);
+
+comment on table content_metadata is 'metadata semantically translated from a content file';
+comment on column content_metadata.id is 'sha1 of content file';
+comment on column content_metadata.translated_metadata is 'result of translation with defined format';
+comment on column content_metadata.indexer_configuration_id is 'tool used for translation';
+
+
+-- Metadata discovered during a listing or a deposit of an origin.
+-- Each entry can also hold a translation of that metadata to a defined json
+-- schema, produced by a translation tool (indexer_configuration_id).
+-- NOTE(review): the table comment below says 'keeps latest metadata' although
+-- this is the history table; confirm the intended wording.
+create table origin_metadata_history(
+ origin_id bigint not null,
+ discovery_date timestamptz not null,
+ translation_date timestamptz,
+ provenance_type text not null, -- TODO use an enum (?)
+ raw_metadata jsonb not null,
+ translated_metadata jsonb,
+ indexer_configuration_id bigint,
+ object_id bigserial -- short object identifier
+);
+
+
+comment on table origin_metadata_history is 'keeps latest metadata concerning an origin';
+comment on column origin_metadata_history.origin_id is 'the origin id for which the metadata was found';
+comment on column origin_metadata_history.discovery_date is 'the date of retrieval';
+comment on column origin_metadata_history.translation_date is 'the date of translation';
+comment on column origin_metadata_history.provenance_type is 'lister, publisher, etc';
+comment on column origin_metadata_history.raw_metadata is 'metadata in json format but with original terms';
+comment on column origin_metadata_history.translated_metadata is 'metadata in defined terms in json schema';
+comment on column origin_metadata_history.indexer_configuration_id is 'tool used for translation';
+
+
+-- Plain table used as a materialized view of origin_metadata_history,
+-- storing the *current* value of metadata, as last seen by SWH.
+-- It has the same columns as origin_metadata_history, minus object_id.
+create table origin_metadata(
+ origin_id bigint not null,
+ discovery_date timestamptz not null,
+ translation_date timestamptz,
+ provenance_type text not null, -- TODO use an enum (?)
+ raw_metadata jsonb not null,
+ translated_metadata jsonb,
+ indexer_configuration_id bigint
+);
diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py
--- a/swh/storage/api/client.py
+++ b/swh/storage/api/client.py
@@ -231,5 +231,17 @@
def content_fossology_license_get(self, ids):
return self.post('content/fossology_license', {'ids': ids})
+ def content_metadata_add(self, metadatas, conflict_update=False):
+ # Send the metadatas and the conflict_update flag to the
+ # 'content_metadata/add' endpoint and return what self.post returns.
+ return self.post('content_metadata/add', {
+ 'metadatas': metadatas,
+ 'conflict_update': conflict_update,
+ })
+
+ def content_metadata_missing(self, metadatas):
+ # Send the metadatas to the 'content_metadata/missing' endpoint and
+ # return what self.post returns.
+ return self.post('content_metadata/missing', {'metadatas': metadatas})
+
+ def content_metadata_get(self, ids):
+ # Send the ids to the 'content_metadata' endpoint and return what
+ # self.post returns.
+ return self.post('content_metadata', {'ids': ids})
+
def indexer_configuration_get(self, tool):
return self.post('indexer_configuration/data', {'tool': tool})
diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py
--- a/swh/storage/api/server.py
+++ b/swh/storage/api/server.py
@@ -380,6 +380,24 @@
**decode_request(request)))
+# POST endpoint: the decoded request is expanded into the keyword arguments
+# of g.storage.content_metadata_add and the result is encoded by encode_data.
+@app.route('/content_metadata/add', methods=['POST'])
+def content_metadata_add():
+ return encode_data(
+ g.storage.content_metadata_add(**decode_request(request)))
+
+
+# POST endpoint: the decoded request is expanded into the keyword arguments
+# of g.storage.content_metadata_missing and the result is encoded by
+# encode_data.
+@app.route('/content_metadata/missing', methods=['POST'])
+def content_metadata_missing():
+ return encode_data(
+ g.storage.content_metadata_missing(**decode_request(request)))
+
+
+# POST endpoint: the decoded request is expanded into the keyword arguments
+# of g.storage.content_metadata_get and the result is encoded by encode_data.
+@app.route('/content_metadata', methods=['POST'])
+def content_metadata_get():
+ return encode_data(
+ g.storage.content_metadata_get(**decode_request(request)))
+
+
@app.route('/stat/counters', methods=['GET'])
def stat_counters():
return encode_data(g.storage.stat_counters())
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -416,6 +416,22 @@
}
+def db_to_metadata(metadata):
+ """Convert a metadata entry into a ready metadata output.
+
+ Args:
+     metadata: mapping with keys id, translated_metadata, tool_id,
+         tool_name, tool_version and tool_configuration
+
+ Returns:
+     a dict with keys id, translated_metadata and tool, where tool is a
+     dict with keys id, name, version and configuration built from the
+     tool_* entries of the input
+
+ """
+ return {
+ 'id': metadata['id'],
+ 'translated_metadata': metadata['translated_metadata'],
+ 'tool': {
+ 'id': metadata['tool_id'],
+ 'name': metadata['tool_name'],
+ 'version': metadata['tool_version'],
+ 'configuration': metadata['tool_configuration']
+ }
+ }
+
+
def db_to_fossology_license(license):
return {
'id': license['id'],
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -967,6 +967,35 @@
cur.execute(query)
yield from cursor_to_bytes(cur)
+ # Columns selected from swh_content_metadata_get(), in the order in which
+ # callers zip them with the returned rows.
+ content_metadata_cols = [
+ 'id', 'translated_metadata',
+ 'tool_id', 'tool_name', 'tool_version', 'tool_configuration']
+
+ # NOTE(review): the bodies below are empty; presumably the stored_procedure
+ # decorator runs the named SQL function on the given cursor. Confirm
+ # against the decorator definition.
+ @stored_procedure('swh_mktemp_content_metadata')
+ def mktemp_content_metadata(self, cur=None): pass
+
+ @stored_procedure('swh_mktemp_content_metadata_missing')
+ def mktemp_content_metadata_missing(self, cur=None): pass
+
+ def content_metadata_missing_from_temp(self, cur=None):
+ """List missing metadatas.
+
+ Runs the SQL function swh_content_metadata_missing() on the cursor
+ obtained from self._cursor(cur) and yields the rows produced by
+ cursor_to_bytes.
+
+ """
+ cur = self._cursor(cur)
+ cur.execute("SELECT * FROM swh_content_metadata_missing()")
+ yield from cursor_to_bytes(cur)
+
+ def content_metadata_add_from_temp(self, conflict_update, cur=None):
+ # Run the SQL function swh_content_metadata_add, passing conflict_update
+ # as a bound query parameter.
+ self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)",
+ (conflict_update, ))
+
+ def content_metadata_get_from_temp(self, cur=None):
+ # Select the content_metadata_cols columns from the SQL function
+ # swh_content_metadata_get() and yield the rows produced by
+ # cursor_to_bytes. Only the fixed column names of content_metadata_cols
+ # are interpolated into the query text.
+ cur = self._cursor(cur)
+ query = "SELECT %s FROM swh_content_metadata_get()" % (
+ ','.join(self.content_metadata_cols))
+ cur.execute(query)
+ yield from cursor_to_bytes(cur)
+
indexer_configuration_cols = ['id', 'tool_name', 'tool_version',
'tool_configuration']
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -1558,6 +1558,61 @@
cur=cur)
db.content_fossology_license_add_from_temp(conflict_update, cur)
+ @db_transaction_generator
+ def content_metadata_missing(self, metadatas, cur=None):
+ """List metadatas missing from storage.
+
+ Args:
+ metadatas: iterable of dict with keys:
+ - id (bytes): sha1 identifier
+ - indexer_configuration_id: identifier of the tool used to
+   compute the results
+
+ Returns:
+ an iterable of missing id
+
+ """
+ db = self.db
+ db.mktemp_content_metadata_missing(cur)
+ db.copy_to(metadatas, 'tmp_content_metadata_missing',
+ ['id', 'indexer_configuration_id'], cur)
+ # each returned row holds a single column: the missing id
+ for obj in db.content_metadata_missing_from_temp(cur):
+ yield obj[0]
+
+ @db_transaction_generator
+ def content_metadata_get(self, ids, cur=None):
+ db = self.db
+ db.store_tmp_bytea(ids, cur)
+ for c in db.content_metadata_get_from_temp():
+ yield converters.db_to_metadata(
+ dict(zip(db.content_metadata_cols, c)))
+
+ @db_transaction
+ def content_metadata_add(self, metadatas, conflict_update=False, cur=None):
+ """Add metadatas not present in storage.
+
+ Args:
+ metadatas: iterable of dictionary with keys:
+ - id: sha1
+ - translated_metadata: the translated metadata (stored in a
+   jsonb column)
+ - indexer_configuration_id: identifier of the tool used to
+   compute the results
+ conflict_update: Flag to determine if we want to overwrite (true)
+ or skip duplicates (false, the default)
+
+ """
+ db = self.db
+ db.mktemp_content_metadata(cur)
+ # only the three columns of tmp_content_metadata are copied over
+ db.copy_to(
+ ({
+ 'id': m['id'],
+ 'translated_metadata': m['translated_metadata'],
+ 'indexer_configuration_id': m['indexer_configuration_id'],
+ } for m in metadatas),
+ 'tmp_content_metadata',
+ ['id', 'translated_metadata', 'indexer_configuration_id'], cur)
+
+ db.content_metadata_add_from_temp(conflict_update, cur)
+
@db_transaction
def indexer_configuration_get(self, tool, cur=None):
db = self.db
diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py
--- a/swh/storage/tests/test_converters.py
+++ b/swh/storage/tests/test_converters.py
@@ -284,3 +284,29 @@
actual_license = converters.db_to_fossology_license(input_license)
self.assertEquals(actual_license, expected_license)
+
+ @istest
+ def db_to_metadata(self):
+ # The flat tool_* keys of the input must end up nested under 'tool'.
+ input_metadata = {
+ 'id': b'some-id',
+ 'tool_id': 20,
+ 'tool_name': 'some-toolname',
+ 'tool_version': 'some-toolversion',
+ 'tool_configuration': {},
+ 'translated_metadata': b'translated_metadata',
+ }
+
+ expected_metadata = {
+ 'id': b'some-id',
+ 'translated_metadata': b'translated_metadata',
+ 'tool': {
+ 'id': 20,
+ 'name': 'some-toolname',
+ 'version': 'some-toolversion',
+ 'configuration': {},
+ }
+ }
+
+ actual_metadata = converters.db_to_metadata(input_metadata)
+
+ self.assertEquals(actual_metadata, expected_metadata)
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2947,6 +2947,225 @@
})
self.assertEqual(actual_licenses, [expected_license])
+ @istest
+ def content_metadata_missing(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadatas = [
+ {
+ 'id': self.cont2['sha1'],
+ 'indexer_configuration_id': tool_id,
+ },
+ {
+ 'id': self.missing_cont['sha1'],
+ 'indexer_configuration_id': tool_id,
+ }
+ ]
+
+ # when
+ actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+ # then: no metadata was added yet, so both ids are reported missing
+ self.assertEqual(list(actual_missing), [
+ self.cont2['sha1'],
+ self.missing_cont['sha1'],
+ ])
+
+ # given: metadata is now added for cont2 only
+ self.storage.content_metadata_add([{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id
+ }])
+
+ # when
+ actual_missing = list(self.storage.content_metadata_missing(metadatas))
+
+ # then: only the id without metadata is still reported missing
+ self.assertEqual(actual_missing, [self.missing_cont['sha1']])
+
+ @istest
+ def content_metadata_get(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # when
+ self.storage.content_metadata_add([metadata1])
+
+ # then: querying a stored id and an unknown id returns one entry only
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1'], self.missing_cont['sha1']]))
+
+ expected_metadatas = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'codeRepository': {
+ 'type': 'git',
+ 'url': 'https://github.com/moranegg/metadata_test'
+ },
+ 'description': 'Simple package.json test for indexer',
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas)
+
+ @istest
+ def content_metadata_add_drop_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata_v1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.content_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ expected_metadatas_v1 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given: same id and tool, different metadata, default conflict_update
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_drop_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ })
+
+ self.storage.content_metadata_add([metadata_v2])
+
+ # then
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # metadata did not change as the v2 was dropped.
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ @istest
+ def content_metadata_add_update_in_place_duplicate(self):
+ # given
+ tools = self.fetch_tools()
+ tool_id = tools['swh-metadata-translator']['id']
+ cont2 = self.cont2
+ self.storage.content_add([cont2])
+
+ metadata_v1 = {
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'indexer_configuration_id': tool_id,
+ }
+
+ # given
+ self.storage.content_metadata_add([metadata_v1])
+
+ # when
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # then
+ expected_metadatas_v1 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+ self.assertEqual(actual_metadatas, expected_metadatas_v1)
+
+ # given: same id and tool, different metadata, conflict_update=True
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update({
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_update_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ })
+ self.storage.content_metadata_add([metadata_v2], conflict_update=True)
+
+ actual_metadatas = list(self.storage.content_metadata_get(
+ [self.cont2['sha1']]))
+
+ # the stored entry is expected to hold the v2 metadata
+ expected_metadatas_v2 = [{
+ 'id': self.cont2['sha1'],
+ 'translated_metadata': {
+ 'other': {},
+ 'name': 'test_update_duplicated_metadata',
+ 'version': '0.0.1'
+ },
+ 'tool': tools['swh-metadata-translator']
+ }]
+
+ # metadata did change as the v2 was used to overwrite v1
+ self.assertEqual(actual_metadatas, expected_metadatas_v2)
+
class TestLocalStorage(CommonTestStorage, unittest.TestCase):
"""Test the local storage"""
@@ -3040,6 +3259,33 @@
self.assertEqual(expected_tool, actual_tool)
+ @istest
+ def indexer_configuration_metadata_get_missing_context(self):
+ # A tool_configuration that matches no registered tool must yield None.
+ tool = {
+ 'tool_name': 'swh-metadata-translator',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {"context": "unknown-context"},
+ }
+
+ actual_tool = self.storage.indexer_configuration_get(tool)
+
+ self.assertIsNone(actual_tool)
+
+ @istest
+ def indexer_configuration_metadata_get(self):
+ # Same name, version and configuration as the row inserted by
+ # sql/swh-data.sql for the npm context.
+ tool = {
+ 'tool_name': 'swh-metadata-translator',
+ 'tool_version': '0.0.1',
+ 'tool_configuration': {"type": "local", "context": "npm"},
+ }
+
+ actual_tool = self.storage.indexer_configuration_get(tool)
+
+ # the id is assigned by the database, so it is taken from the result
+ expected_tool = tool.copy()
+ expected_tool['id'] = actual_tool['id']
+
+ self.assertEqual(expected_tool, actual_tool)
+
class AlteringSchemaTest(BaseTestStorage, unittest.TestCase):
"""This class is dedicated for the rare case where the schema needs to
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 4:33 PM (11 w, 16 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220384
Attached To
D219: Added content_metadata logic to the storage
Event Timeline
Log In to Comment