diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -130,3 +130,19 @@
 comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
 comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
 comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+create table origin_extrinsic_metadata(
+  id                        text          not null,  -- origin url
+  metadata                  jsonb,
+  indexer_configuration_id  bigint        not null,
+  from_remd_id              sha1_git      not null,  -- id of the source raw extrinsic metadata object
+  metadata_tsvector         tsvector,                -- filled by swh_origin_extrinsic_metadata_compute_tsvector()
+  mappings                  text array    not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw extrinsic metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'sha1_git of the raw extrinsic metadata object this metadata was extracted from.';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -389,6 +389,79 @@
 end
 $$;
 
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+    returns void
+    language sql
+as $$
+  create temporary table if not exists tmp_origin_extrinsic_metadata (
+    like origin_extrinsic_metadata including defaults
+  ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+    returns void
+    language sql
+as $$
+  create temporary table if not exists tmp_indexer_configuration (
+    like indexer_configuration including defaults
+  ) on commit delete rows;
+  alter table tmp_indexer_configuration drop column if exists id;  -- the staging table carries no id column
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+    returns bigint
+    language plpgsql
+as $$
+declare
+  res bigint;
+begin
+    perform swh_origin_extrinsic_metadata_compute_tsvector();  -- fill metadata_tsvector on the staged rows first
+
+    insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+    select id, metadata, indexer_configuration_id, from_remd_id,
+           metadata_tsvector, mappings
+    from tmp_origin_extrinsic_metadata
+    on conflict(id, indexer_configuration_id)  -- staged rows must be unique on this key, or the upsert errors out
+    do update set
+        metadata = excluded.metadata,
+        metadata_tsvector = excluded.metadata_tsvector,
+        mappings = excluded.mappings,
+        from_remd_id = excluded.from_remd_id;
+
+    get diagnostics res = ROW_COUNT;  -- rows inserted or updated by the statement above
+    return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+    returns void
+    language plpgsql
+as $$
+begin
+    update tmp_origin_extrinsic_metadata  -- no WHERE on purpose: every staged row is recomputed
+        set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
 -- add tmp_indexer_configuration entries to indexer_configuration,
 -- overwriting duplicates if any.
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -67,3 +67,13 @@
 
 create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
 create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
+
+-- origin_extrinsic_metadata
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/sql/upgrades/135.sql b/swh/indexer/sql/upgrades/135.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/135.sql
@@ -0,0 +1,106 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 134
+-- to_version: 135
+-- description: Add support for origin_extrinsic_metadata
+
+insert into dbversion(version, release, description)
+      values(135, now(), 'Work In Progress');
+
+create table origin_extrinsic_metadata(
+  id                        text          not null,  -- origin url
+  metadata                  jsonb,
+  indexer_configuration_id  bigint        not null,
+  from_remd_id              sha1_git      not null,  -- id of the source raw extrinsic metadata object
+  metadata_tsvector         tsvector,                -- filled by swh_origin_extrinsic_metadata_compute_tsvector()
+  mappings                  text array    not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw extrinsic metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'sha1_git of the raw extrinsic metadata object this metadata was extracted from.';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
+
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+    returns void
+    language sql
+as $$
+  create temporary table if not exists tmp_origin_extrinsic_metadata (
+    like origin_extrinsic_metadata including defaults
+  ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+    returns void
+    language sql
+as $$
+  create temporary table if not exists tmp_indexer_configuration (
+    like indexer_configuration including defaults
+  ) on commit delete rows;
+  alter table tmp_indexer_configuration drop column if exists id;  -- the staging table carries no id column
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+    returns bigint
+    language plpgsql
+as $$
+declare
+  res bigint;
+begin
+    perform swh_origin_extrinsic_metadata_compute_tsvector();  -- fill metadata_tsvector on the staged rows first
+
+    insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+    select id, metadata, indexer_configuration_id, from_remd_id,
+           metadata_tsvector, mappings
+    from tmp_origin_extrinsic_metadata
+    on conflict(id, indexer_configuration_id)  -- staged rows must be unique on this key, or the upsert errors out
+    do update set
+        metadata = excluded.metadata,
+        metadata_tsvector = excluded.metadata_tsvector,
+        mappings = excluded.mappings,
+        from_remd_id = excluded.from_remd_id;
+
+    get diagnostics res = ROW_COUNT;  -- rows inserted or updated by the statement above
+    return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+    returns void
+    language plpgsql
+as $$
+begin
+    update tmp_origin_extrinsic_metadata  -- no WHERE on purpose: every staged row is recomputed
+        set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
+-- origin_extrinsic_metadata: primary key, tool foreign key and GIN search indexes
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -31,6 +31,7 @@
     ContentMetadataRow,
     ContentMimetypeRow,
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from .writer import JournalWriter
@@ -122,7 +123,7 @@
 class IndexerStorage:
     """SWH Indexer Storage Datastore"""
 
-    current_version = 134
+    current_version = 135
 
     def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
         """
@@ -706,6 +707,52 @@
             "per_mapping": results,
         }
 
+    @timed
+    @db_transaction()
+    def origin_extrinsic_metadata_get(
+        self, urls: Iterable[str], db=None, cur=None
+    ) -> List[OriginExtrinsicMetadataRow]:  # rows stored for the given origin URLs
+        return [
+            OriginExtrinsicMetadataRow.from_dict(
+                converters.db_to_metadata(
+                    dict(zip(db.origin_extrinsic_metadata_cols, c))
+                )
+            )
+            for c in db.origin_extrinsic_metadata_get_from_list(urls, cur)
+        ]
+
+    @timed
+    @process_metrics
+    @db_transaction()
+    def origin_extrinsic_metadata_add(
+        self,
+        metadata: List[OriginExtrinsicMetadataRow],
+        db=None,
+        cur=None,
+    ) -> Dict[str, int]:  # {"origin_extrinsic_metadata:add": <count from the SQL function>}
+        check_id_duplicates(metadata)  # NOTE(review): presumably raises DuplicateId on repeated rows - confirm
+        metadata.sort(key=lambda m: m.id)  # in place: reorders the caller's list by origin URL
+        self.journal_writer.write_additions("origin_extrinsic_metadata", metadata)
+
+        db.mktemp_origin_extrinsic_metadata(cur)
+
+        db.copy_to(
+            [m.to_dict() for m in metadata],
+            "tmp_origin_extrinsic_metadata",
+            [  # metadata_tsvector is left out: swh_origin_extrinsic_metadata_add() computes it
+                "id",
+                "metadata",
+                "indexer_configuration_id",
+                "from_remd_id",
+                "mappings",
+            ],
+            cur,
+        )
+        count = db.origin_extrinsic_metadata_add_from_temp(cur)
+        return {
+            "origin_extrinsic_metadata:add": count,
+        }
+
     @timed
     @db_transaction()
     def indexer_configuration_add(self, tools, db=None, cur=None):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -486,6 +486,35 @@
         cur.execute(" ".join(query_parts), args)
         yield from cur
 
+    origin_extrinsic_metadata_cols = [  # keys zipped with each fetched row by IndexerStorage
+        "id",
+        "metadata",
+        "from_remd_id",
+        "mappings",
+        "tool_id",
+        "tool_name",
+        "tool_version",
+        "tool_configuration",
+    ]
+
+    @stored_procedure("swh_mktemp_origin_extrinsic_metadata")
+    def mktemp_origin_extrinsic_metadata(self, cur=None):
+        pass  # NOTE(review): the stored_procedure decorator presumably issues the call - confirm
+
+    def origin_extrinsic_metadata_add_from_temp(self, cur=None):
+        cur = self._cursor(cur)
+        cur.execute("select * from swh_origin_extrinsic_metadata_add()")
+        return cur.fetchone()[0]  # the bigint returned by the SQL function
+
+    def origin_extrinsic_metadata_get_from_list(self, ids, cur=None):
+        yield from self._get_from_list(
+            "origin_extrinsic_metadata",
+            ids,
+            self.origin_extrinsic_metadata_cols,
+            cur=cur,
+            id_col="id",
+        )
+
     indexer_configuration_cols = [
         "id",
         "tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -39,6 +39,7 @@
     ContentMetadataRow,
     ContentMimetypeRow,
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from .writer import JournalWriter
@@ -254,6 +255,7 @@
             DirectoryIntrinsicMetadataRow, *args
         )
         self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
+        self._origin_extrinsic_metadata = SubStorage(OriginExtrinsicMetadataRow, *args)
 
     def check_config(self, *, check_write):
         return True
@@ -483,6 +485,17 @@
                 mapping_count[mapping] += 1
         return {"per_mapping": mapping_count, "total": total, "non_empty": non_empty}
 
+    def origin_extrinsic_metadata_get(
+        self, urls: Iterable[str]
+    ) -> List[OriginExtrinsicMetadataRow]:  # delegates to the SubStorage lookup
+        return self._origin_extrinsic_metadata.get(urls)
+
+    def origin_extrinsic_metadata_add(
+        self, metadata: List[OriginExtrinsicMetadataRow]
+    ) -> Dict[str, int]:  # same result key as the SQL-backed storage
+        added = self._origin_extrinsic_metadata.add(metadata)
+        return {"origin_extrinsic_metadata:add": added}
+
     def indexer_configuration_add(self, tools):
         inserted = []
         for tool in tools:
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -16,6 +16,7 @@
     ContentMetadataRow,
     ContentMimetypeRow,
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 
@@ -479,6 +480,34 @@
         """
         ...
 
+    @remote_api_endpoint("origin_extrinsic_metadata")
+    def origin_extrinsic_metadata_get(
+        self, urls: Iterable[str]
+    ) -> List[OriginExtrinsicMetadataRow]:
+        """Retrieve origin extrinsic metadata for the given origin URLs.
+
+        Args:
+            urls (iterable): origin URLs
+
+        Returns: list of OriginExtrinsicMetadataRow
+        """
+        ...
+
+    @remote_api_endpoint("origin_extrinsic_metadata/add")
+    def origin_extrinsic_metadata_add(
+        self, metadata: List[OriginExtrinsicMetadataRow]
+    ) -> Dict[str, int]:
+        """Add origin extrinsic metadata; the SQL backend overwrites existing rows.
+
+        Args:
+            metadata: list of OriginExtrinsicMetadataRow objects
+
+        Returns:
+            Dict with key ``origin_extrinsic_metadata:add`` and the row count
+
+        """
+        ...
+
     @remote_api_endpoint("indexer_configuration/add")
     def indexer_configuration_add(self, tools):
         """Add new tools to the storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -136,3 +136,15 @@
     metadata = attr.ib(type=Dict[str, Any])
     from_directory = attr.ib(type=Sha1Git)
     mappings = attr.ib(type=List[str])
+
+
+@attr.s
+class OriginExtrinsicMetadataRow(BaseRow):  # one row of the origin_extrinsic_metadata table
+    object_type: Final = "origin_extrinsic_metadata"
+
+    id = attr.ib(type=str)
+    """origin URL"""
+    metadata = attr.ib(type=Dict[str, Any])  # stored in the jsonb "metadata" column
+    from_remd_id = attr.ib(type=Sha1Git)
+    """id of the RawExtrinsicMetadata object used as source for indexed metadata"""
+    mappings = attr.ib(type=List[str])  # mapping names, eg. github, gitlab
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -20,6 +20,7 @@
     ContentMetadataRow,
     ContentMimetypeRow,
     DirectoryIntrinsicMetadataRow,
+    OriginExtrinsicMetadataRow,
     OriginIntrinsicMetadataRow,
 )
 from swh.model.hashutil import hash_to_bytes
@@ -1712,6 +1713,253 @@
         }
 
 
+
+class TestIndexerStorageOriginExtrinsicMetadata:  # add/get tests for origin_extrinsic_metadata
+    def test_origin_extrinsic_metadata_add(
+        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+    ) -> None:  # add one row, read it back, check the journal
+        storage, data = swh_indexer_storage_with_data
+        # given
+        tool_id = data.tools["swh-metadata-detector"]["id"]
+
+        metadata = {
+            "version": None,
+            "name": None,
+        }
+        metadata_origin = OriginExtrinsicMetadataRow(
+            id=data.origin_url_1,
+            metadata=metadata,
+            indexer_configuration_id=tool_id,
+            mappings=["mapping1"],
+            from_remd_id=b"\x02" * 20,
+        )
+
+        # when
+        storage.origin_extrinsic_metadata_add([metadata_origin])
+
+        # then: the unknown URL "no://where" must not produce a row
+        actual_metadata = list(
+            storage.origin_extrinsic_metadata_get([data.origin_url_1, "no://where"])
+        )
+
+        expected_metadata = [
+            OriginExtrinsicMetadataRow(
+                id=data.origin_url_1,
+                metadata=metadata,
+                tool=data.tools["swh-metadata-detector"],
+                from_remd_id=b"\x02" * 20,
+                mappings=["mapping1"],
+            )
+        ]
+
+        assert actual_metadata == expected_metadata
+
+        journal_objects = storage.journal_writer.journal.objects  # type: ignore
+        actual_journal_metadata = [
+            obj
+            for (obj_type, obj) in journal_objects
+            if obj_type == "origin_extrinsic_metadata"
+        ]
+        assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata))
+
+    def test_origin_extrinsic_metadata_add_update_in_place_duplicate(
+        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+    ) -> None:  # adding the same (origin, tool) again overwrites the row
+        storage, data = swh_indexer_storage_with_data
+        # given
+        tool_id = data.tools["swh-metadata-detector"]["id"]
+
+        metadata_v1: Dict[str, Any] = {
+            "version": None,
+            "name": None,
+        }
+        metadata_origin_v1 = OriginExtrinsicMetadataRow(
+            id=data.origin_url_1,
+            metadata=metadata_v1.copy(),
+            indexer_configuration_id=tool_id,
+            mappings=[],
+            from_remd_id=b"\x02" * 20,
+        )
+
+        # given: v1 is stored
+        storage.origin_extrinsic_metadata_add([metadata_origin_v1])
+
+        # when
+        actual_metadata = list(
+            storage.origin_extrinsic_metadata_get([data.origin_url_1])
+        )
+
+        # then
+        expected_metadata_v1 = [
+            OriginExtrinsicMetadataRow(
+                id=data.origin_url_1,
+                metadata=metadata_v1,
+                tool=data.tools["swh-metadata-detector"],
+                from_remd_id=b"\x02" * 20,
+                mappings=[],
+            )
+        ]
+        assert actual_metadata == expected_metadata_v1
+
+        # given: v2 for the same origin and tool
+        metadata_v2 = metadata_v1.copy()
+        metadata_v2.update(
+            {
+                "name": "test_update_duplicated_metadata",
+                "author": "MG",
+            }
+        )
+        metadata_origin_v2 = OriginExtrinsicMetadataRow(
+            id=data.origin_url_1,
+            metadata=metadata_v2.copy(),
+            indexer_configuration_id=tool_id,
+            mappings=["github"],
+            from_remd_id=b"\x02" * 20,
+        )
+
+        storage.origin_extrinsic_metadata_add([metadata_origin_v2])
+
+        actual_metadata = list(
+            storage.origin_extrinsic_metadata_get([data.origin_url_1])
+        )
+
+        expected_metadata_v2 = [
+            OriginExtrinsicMetadataRow(
+                id=data.origin_url_1,
+                metadata=metadata_v2,
+                tool=data.tools["swh-metadata-detector"],
+                from_remd_id=b"\x02" * 20,
+                mappings=["github"],
+            )
+        ]
+
+        # metadata did change as the v2 was used to overwrite v1
+        assert actual_metadata == expected_metadata_v2
+
+    def test_origin_extrinsic_metadata_add__deadlock(
+        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+    ) -> None:  # two overlapping adds run from two threads must both return
+        storage, data = swh_indexer_storage_with_data
+        # given
+        tool_id = data.tools["swh-metadata-detector"]["id"]
+
+        origins = ["file:///tmp/origin{:02d}".format(i) for i in range(100)]
+
+        example_data1: Dict[str, Any] = {
+            "metadata": {
+                "version": None,
+                "name": None,
+            },
+            "mappings": [],
+        }
+        example_data2: Dict[str, Any] = {
+            "metadata": {
+                "version": "v1.1.1",
+                "name": "foo",
+            },
+            "mappings": [],
+        }
+
+        data_v1 = [
+            OriginExtrinsicMetadataRow(
+                id=origin,
+                from_remd_id=b"\x02" * 20,
+                indexer_configuration_id=tool_id,
+                **example_data1,
+            )
+            for origin in origins
+        ]
+        data_v2 = [
+            OriginExtrinsicMetadataRow(
+                id=origin,
+                from_remd_id=b"\x02" * 20,
+                indexer_configuration_id=tool_id,
+                **example_data2,
+            )
+            for origin in origins
+        ]
+
+        # Remove one item from each, so that both queries have to succeed for
+        # all items to be in the DB.
+        data_v2a = data_v2[1:]
+        data_v2b = list(reversed(data_v2[0:-1]))  # submitted in the opposite order to data_v2a
+
+        # given
+        storage.origin_extrinsic_metadata_add(data_v1)
+
+        # when
+        actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+        expected_data_v1 = [
+            OriginExtrinsicMetadataRow(
+                id=origin,
+                from_remd_id=b"\x02" * 20,
+                tool=data.tools["swh-metadata-detector"],
+                **example_data1,
+            )
+            for origin in origins
+        ]
+
+        # then
+        assert actual_data == expected_data_v1
+
+        # given: two writers touching overlapping rows
+        def f1() -> None:
+            storage.origin_extrinsic_metadata_add(data_v2a)
+
+        def f2() -> None:
+            storage.origin_extrinsic_metadata_add(data_v2b)
+
+        t1 = threading.Thread(target=f1)
+        t2 = threading.Thread(target=f2)
+        t2.start()
+        t1.start()
+
+        t1.join()
+        t2.join()
+
+        actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+        expected_data_v2 = [
+            OriginExtrinsicMetadataRow(
+                id=origin,
+                from_remd_id=b"\x02" * 20,
+                tool=data.tools["swh-metadata-detector"],
+                **example_data2,
+            )
+            for origin in origins
+        ]
+
+        actual_data.sort(key=lambda item: item.id)
+        assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+        for (item, expected_item_v1, expected_item_v2) in zip(
+            actual_data, expected_data_v1, expected_data_v2
+        ):
+            assert item in (expected_item_v1, expected_item_v2)  # lenient: v1 or v2 accepted per row
+
+    def test_origin_extrinsic_metadata_add__duplicate_twice(
+        self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+    ) -> None:  # the same row twice in one call raises DuplicateId
+        storage, data = swh_indexer_storage_with_data
+        # given
+        tool_id = data.tools["swh-metadata-detector"]["id"]
+
+        metadata = {
+            "developmentStatus": None,
+            "name": None,
+        }
+        metadata_origin = OriginExtrinsicMetadataRow(
+            id=data.origin_url_1,
+            metadata=metadata,
+            indexer_configuration_id=tool_id,
+            mappings=["mapping1"],
+            from_remd_id=b"\x02" * 20,
+        )
+
+        # when
+        with pytest.raises(DuplicateId):
+            storage.origin_extrinsic_metadata_add([metadata_origin, metadata_origin])
+
+
 class TestIndexerStorageIndexerConfiguration:
     def
test_indexer_configuration_add( self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]