Page MenuHomeSoftware Heritage

D8058.id29125.diff
No OneTemporary

D8058.id29125.diff

diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -130,3 +130,19 @@
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+create table origin_extrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_remd_id sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw extrinsic metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'id of the raw extrinsic metadata object this metadata was extracted from.';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -389,6 +389,79 @@
end
$$;
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_extrinsic_metadata (
+ like origin_extrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_indexer_configuration (
+ like indexer_configuration including defaults
+ ) on commit delete rows;
+ alter table tmp_indexer_configuration drop column if exists id;
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_extrinsic_metadata_compute_tsvector();
+
+ insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_remd_id,
+ metadata_tsvector, mappings
+ from tmp_origin_extrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_remd_id = excluded.from_remd_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+ returns void
+ language plpgsql
+as $$
+begin
+ update tmp_origin_extrinsic_metadata
+ set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
-- add tmp_indexer_configuration entries to indexer_configuration,
-- overwriting duplicates if any.
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -67,3 +67,13 @@
create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
+
+-- origin_extrinsic_metadata
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/sql/upgrades/135.sql b/swh/indexer/sql/upgrades/135.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/135.sql
@@ -0,0 +1,106 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 134
+-- to_version: 135
+-- description: Add support for origin_extrinsic_metadata
+
+insert into dbversion(version, release, description)
+ values(135, now(), 'Work In Progress');
+
+create table origin_extrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_remd_id sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw extrinsic metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'id of the raw extrinsic metadata object this metadata was extracted from.';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
+
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_extrinsic_metadata (
+ like origin_extrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_indexer_configuration (
+ like indexer_configuration including defaults
+ ) on commit delete rows;
+ alter table tmp_indexer_configuration drop column if exists id;
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_extrinsic_metadata_compute_tsvector();
+
+ insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_remd_id,
+ metadata_tsvector, mappings
+ from tmp_origin_extrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_remd_id = excluded.from_remd_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+ returns void
+ language plpgsql
+as $$
+begin
+ update tmp_origin_extrinsic_metadata
+ set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
+-- origin_extrinsic_metadata
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -31,6 +31,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -122,7 +123,7 @@
class IndexerStorage:
"""SWH Indexer Storage Datastore"""
- current_version = 134
+ current_version = 135
def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
"""
@@ -706,6 +707,52 @@
"per_mapping": results,
}
+ @timed
+ @db_transaction()
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str], db=None, cur=None
+ ) -> List[OriginExtrinsicMetadataRow]:
+ return [
+ OriginExtrinsicMetadataRow.from_dict(
+ converters.db_to_metadata(
+ dict(zip(db.origin_extrinsic_metadata_cols, c))
+ )
+ )
+ for c in db.origin_extrinsic_metadata_get_from_list(urls, cur)
+ ]
+
+ @timed
+ @process_metrics
+ @db_transaction()
+ def origin_extrinsic_metadata_add(
+ self,
+ metadata: List[OriginExtrinsicMetadataRow],
+ db=None,
+ cur=None,
+ ) -> Dict[str, int]:
+ check_id_duplicates(metadata)
+ metadata.sort(key=lambda m: m.id)
+ self.journal_writer.write_additions("origin_extrinsic_metadata", metadata)
+
+ db.mktemp_origin_extrinsic_metadata(cur)
+
+ db.copy_to(
+ [m.to_dict() for m in metadata],
+ "tmp_origin_extrinsic_metadata",
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_remd_id",
+ "mappings",
+ ],
+ cur,
+ )
+ count = db.origin_extrinsic_metadata_add_from_temp(cur)
+ return {
+ "origin_extrinsic_metadata:add": count,
+ }
+
@timed
@db_transaction()
def indexer_configuration_add(self, tools, db=None, cur=None):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -486,6 +486,35 @@
cur.execute(" ".join(query_parts), args)
yield from cur
+ origin_extrinsic_metadata_cols = [
+ "id",
+ "metadata",
+ "from_remd_id",
+ "mappings",
+ "tool_id",
+ "tool_name",
+ "tool_version",
+ "tool_configuration",
+ ]
+
+ @stored_procedure("swh_mktemp_origin_extrinsic_metadata")
+ def mktemp_origin_extrinsic_metadata(self, cur=None):
+ pass
+
+ def origin_extrinsic_metadata_add_from_temp(self, cur=None):
+ cur = self._cursor(cur)
+ cur.execute("select * from swh_origin_extrinsic_metadata_add()")
+ return cur.fetchone()[0]
+
+ def origin_extrinsic_metadata_get_from_list(self, ids, cur=None):
+ yield from self._get_from_list(
+ "origin_extrinsic_metadata",
+ ids,
+ self.origin_extrinsic_metadata_cols,
+ cur=cur,
+ id_col="id",
+ )
+
indexer_configuration_cols = [
"id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -39,6 +39,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -254,6 +255,7 @@
DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
+ self._origin_extrinsic_metadata = SubStorage(OriginExtrinsicMetadataRow, *args)
def check_config(self, *, check_write):
return True
@@ -483,6 +485,17 @@
mapping_count[mapping] += 1
return {"per_mapping": mapping_count, "total": total, "non_empty": non_empty}
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str]
+ ) -> List[OriginExtrinsicMetadataRow]:
+ return self._origin_extrinsic_metadata.get(urls)
+
+ def origin_extrinsic_metadata_add(
+ self, metadata: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ added = self._origin_extrinsic_metadata.add(metadata)
+ return {"origin_extrinsic_metadata:add": added}
+
def indexer_configuration_add(self, tools):
inserted = []
for tool in tools:
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -16,6 +16,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
@@ -479,6 +480,34 @@
"""
...
+ @remote_api_endpoint("origin_extrinsic_metadata")
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str]
+ ) -> List[OriginExtrinsicMetadataRow]:
+ """Retrieve origin extrinsic metadata per origin URL.
+
+ Args:
+ urls (iterable): origin URLs
+
+ Returns: list of OriginExtrinsicMetadataRow (URLs without metadata are skipped)
+ """
+ ...
+
+ @remote_api_endpoint("origin_extrinsic_metadata/add")
+ def origin_extrinsic_metadata_add(
+ self, metadata: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ """Add origin extrinsic metadata, overwriting rows already in storage.
+
+ Args:
+ metadata: list of OriginExtrinsicMetadataRow objects
+
+ Returns:
+ Dict summary of number of rows added
+
+ """
+ ...
+
@remote_api_endpoint("indexer_configuration/add")
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -136,3 +136,15 @@
metadata = attr.ib(type=Dict[str, Any])
from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
+
+
+@attr.s
+class OriginExtrinsicMetadataRow(BaseRow):
+ object_type: Final = "origin_extrinsic_metadata"
+
+ id = attr.ib(type=str)
+ """origin URL"""
+ metadata = attr.ib(type=Dict[str, Any])
+ from_remd_id = attr.ib(type=Sha1Git)
+ """id of the RawExtrinsicMetadata object used as source for indexed metadata"""
+ mappings = attr.ib(type=List[str])
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -20,6 +20,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -1712,6 +1713,253 @@
}
+class TestIndexerStorageOriginExtrinsicMetadata:
+ def test_origin_extrinsic_metadata_add(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata = {
+ "version": None,
+ "name": None,
+ }
+ metadata_origin = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ indexer_configuration_id=tool_id,
+ mappings=["mapping1"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # when
+ storage.origin_extrinsic_metadata_add([metadata_origin])
+
+ # then
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1, "no://where"])
+ )
+
+ expected_metadata = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=["mapping1"],
+ )
+ ]
+
+ assert actual_metadata == expected_metadata
+
+ journal_objects = storage.journal_writer.journal.objects # type: ignore
+ actual_journal_metadata = [
+ obj
+ for (obj_type, obj) in journal_objects
+ if obj_type == "origin_extrinsic_metadata"
+ ]
+ assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata))
+
+ def test_origin_extrinsic_metadata_add_update_in_place_duplicate(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata_v1: Dict[str, Any] = {
+ "version": None,
+ "name": None,
+ }
+ metadata_origin_v1 = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v1.copy(),
+ indexer_configuration_id=tool_id,
+ mappings=[],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # given
+ storage.origin_extrinsic_metadata_add([metadata_origin_v1])
+
+ # when
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1])
+ )
+
+ # then
+ expected_metadata_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v1,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=[],
+ )
+ ]
+ assert actual_metadata == expected_metadata_v1
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update(
+ {
+ "name": "test_update_duplicated_metadata",
+ "author": "MG",
+ }
+ )
+ metadata_origin_v2 = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v2.copy(),
+ indexer_configuration_id=tool_id,
+ mappings=["github"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ storage.origin_extrinsic_metadata_add([metadata_origin_v2])
+
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1])
+ )
+
+ expected_metadata_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v2,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=["github"],
+ )
+ ]
+
+ # metadata did change as the v2 was used to overwrite v1
+ assert actual_metadata == expected_metadata_v2
+
+ def test_origin_extrinsic_metadata_add__deadlock(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ origins = ["file:///tmp/origin{:02d}".format(i) for i in range(100)]
+
+ example_data1: Dict[str, Any] = {
+ "metadata": {
+ "version": None,
+ "name": None,
+ },
+ "mappings": [],
+ }
+ example_data2: Dict[str, Any] = {
+ "metadata": {
+ "version": "v1.1.1",
+ "name": "foo",
+ },
+ "mappings": [],
+ }
+
+ data_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ indexer_configuration_id=tool_id,
+ **example_data1,
+ )
+ for origin in origins
+ ]
+ data_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ indexer_configuration_id=tool_id,
+ **example_data2,
+ )
+ for origin in origins
+ ]
+
+ # Remove one item from each, so that both queries have to succeed for
+ # all items to be in the DB.
+ data_v2a = data_v2[1:]
+ data_v2b = list(reversed(data_v2[0:-1]))
+
+ # given
+ storage.origin_extrinsic_metadata_add(data_v1)
+
+ # when
+ actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+ expected_data_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ tool=data.tools["swh-metadata-detector"],
+ **example_data1,
+ )
+ for origin in origins
+ ]
+
+ # then
+ assert actual_data == expected_data_v1
+
+ # given
+ def f1() -> None:
+ storage.origin_extrinsic_metadata_add(data_v2a)
+
+ def f2() -> None:
+ storage.origin_extrinsic_metadata_add(data_v2b)
+
+ t1 = threading.Thread(target=f1)
+ t2 = threading.Thread(target=f2)
+ t2.start()
+ t1.start()
+
+ t1.join()
+ t2.join()
+
+ actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+ expected_data_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ tool=data.tools["swh-metadata-detector"],
+ **example_data2,
+ )
+ for origin in origins
+ ]
+
+ actual_data.sort(key=lambda item: item.id)
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
+
+ def test_origin_extrinsic_metadata_add__duplicate_twice(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata = {
+ "developmentStatus": None,
+ "name": None,
+ }
+ metadata_origin = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ indexer_configuration_id=tool_id,
+ mappings=["mapping1"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # when
+ with pytest.raises(DuplicateId):
+ storage.origin_extrinsic_metadata_add([metadata_origin, metadata_origin])
+
+
class TestIndexerStorageIndexerConfiguration:
def test_indexer_configuration_add(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:25 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217082

Event Timeline