D8058: Add support for origin_extrinsic_metadata to the storage
D8058.id29125.diff
diff --git a/swh/indexer/sql/30-schema.sql b/swh/indexer/sql/30-schema.sql
--- a/swh/indexer/sql/30-schema.sql
+++ b/swh/indexer/sql/30-schema.sql
@@ -130,3 +130,19 @@
comment on column origin_intrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
comment on column origin_intrinsic_metadata.from_directory is 'sha1 of the directory this metadata was copied from.';
comment on column origin_intrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. pkg-info, npm)';
+
+create table origin_extrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_remd_id sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw_extrinsic_metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'id of the raw_extrinsic_metadata object this metadata was copied from';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
diff --git a/swh/indexer/sql/50-func.sql b/swh/indexer/sql/50-func.sql
--- a/swh/indexer/sql/50-func.sql
+++ b/swh/indexer/sql/50-func.sql
@@ -389,6 +389,79 @@
end
$$;
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_extrinsic_metadata (
+ like origin_extrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_indexer_configuration (
+ like indexer_configuration including defaults
+ ) on commit delete rows;
+ alter table tmp_indexer_configuration drop column if exists id;
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_extrinsic_metadata_compute_tsvector();
+
+ insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_remd_id,
+ metadata_tsvector, mappings
+ from tmp_origin_extrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_remd_id = excluded.from_remd_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+ returns void
+ language plpgsql
+as $$
+begin
+ update tmp_origin_extrinsic_metadata
+ set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
-- add tmp_indexer_configuration entries to indexer_configuration,
-- overwriting duplicates if any.
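The bulk-add flow these functions implement is: create the temporary table, load rows into it, then call swh_origin_extrinsic_metadata_add(), which computes metadata_tsvector and upserts into origin_extrinsic_metadata. Below is a minimal sketch of that flow with psycopg2; the DSN and the indexer_configuration id are assumptions, and a plain INSERT stands in for the COPY used by the Python layer.

import psycopg2
from psycopg2.extras import Json

with psycopg2.connect("dbname=softwareheritage-indexer") as conn:
    with conn.cursor() as cur:
        # 0. create the temporary table
        cur.execute("select swh_mktemp_origin_extrinsic_metadata()")
        # 1. load rows into it (production code uses COPY instead)
        cur.execute(
            """insert into tmp_origin_extrinsic_metadata
                   (id, metadata, indexer_configuration_id, from_remd_id, mappings)
               values (%s, %s, %s, %s, %s)""",
            (
                "https://example.org/user/project",  # origin URL (hypothetical)
                Json({"name": "project"}),           # metadata, stored as jsonb
                1,                                   # indexer_configuration_id (assumed to exist)
                b"\x02" * 20,                        # from_remd_id (sha1_git, 20 bytes)
                ["github"],                          # mappings
            ),
        )
        # 2. compute metadata_tsvector and upsert into origin_extrinsic_metadata
        cur.execute("select swh_origin_extrinsic_metadata_add()")
        print("rows added/updated:", cur.fetchone()[0])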
diff --git a/swh/indexer/sql/60-indexes.sql b/swh/indexer/sql/60-indexes.sql
--- a/swh/indexer/sql/60-indexes.sql
+++ b/swh/indexer/sql/60-indexes.sql
@@ -67,3 +67,13 @@
create index origin_intrinsic_metadata_fulltext_idx on origin_intrinsic_metadata using gin (metadata_tsvector);
create index origin_intrinsic_metadata_mappings_idx on origin_intrinsic_metadata using gin (mappings);
+
+-- origin_extrinsic_metadata
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
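The two GIN indexes serve full-text search over metadata_tsvector and containment queries on mappings. The queries below are illustrative sketches only, issued through psycopg2 with a made-up DSN and search terms; this diff adds no dedicated search endpoint.

import psycopg2

with psycopg2.connect("dbname=softwareheritage-indexer") as conn:
    with conn.cursor() as cur:
        # full-text search, using the same 'pg_catalog.simple' configuration
        # used to build metadata_tsvector
        cur.execute(
            """select id, mappings
                 from origin_extrinsic_metadata
                where metadata_tsvector @@ plainto_tsquery('pg_catalog.simple', %s)""",
            ("project",),
        )
        print(cur.fetchall())

        # origins whose metadata was obtained through a given mapping
        cur.execute(
            "select id from origin_extrinsic_metadata where mappings @> %s",
            (["github"],),
        )
        print(cur.fetchall())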
diff --git a/swh/indexer/sql/upgrades/135.sql b/swh/indexer/sql/upgrades/135.sql
new file mode 100644
--- /dev/null
+++ b/swh/indexer/sql/upgrades/135.sql
@@ -0,0 +1,106 @@
+-- SWH Indexer DB schema upgrade
+-- from_version: 134
+-- to_version: 135
+-- description: Add support for origin_extrinsic_metadata
+
+insert into dbversion(version, release, description)
+ values(135, now(), 'Work In Progress');
+
+create table origin_extrinsic_metadata(
+ id text not null, -- origin url
+ metadata jsonb,
+ indexer_configuration_id bigint not null,
+ from_remd_id sha1_git not null,
+ metadata_tsvector tsvector,
+ mappings text array not null
+);
+
+comment on table origin_extrinsic_metadata is 'keeps extrinsic metadata for an origin';
+comment on column origin_extrinsic_metadata.id is 'url of the origin';
+comment on column origin_extrinsic_metadata.metadata is 'metadata extracted from a raw_extrinsic_metadata object';
+comment on column origin_extrinsic_metadata.indexer_configuration_id is 'tool used to generate this metadata';
+comment on column origin_extrinsic_metadata.from_remd_id is 'id of the raw_extrinsic_metadata object this metadata was copied from';
+comment on column origin_extrinsic_metadata.mappings is 'type of metadata files used to obtain this metadata (eg. github, gitlab)';
+
+-- create a temporary table for retrieving origin_extrinsic_metadata
+create or replace function swh_mktemp_origin_extrinsic_metadata()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_origin_extrinsic_metadata (
+ like origin_extrinsic_metadata including defaults
+ ) on commit delete rows;
+$$;
+
+comment on function swh_mktemp_origin_extrinsic_metadata() is 'Helper table to add origin extrinsic metadata';
+
+create or replace function swh_mktemp_indexer_configuration()
+ returns void
+ language sql
+as $$
+ create temporary table if not exists tmp_indexer_configuration (
+ like indexer_configuration including defaults
+ ) on commit delete rows;
+ alter table tmp_indexer_configuration drop column if exists id;
+$$;
+
+-- add tmp_origin_extrinsic_metadata entries to origin_extrinsic_metadata,
+-- overwriting duplicates.
+--
+-- If filtering duplicates is in order, the call to
+-- swh_origin_extrinsic_metadata_missing must take place before calling this
+-- function.
+--
+-- operates in bulk: 0. swh_mktemp_origin_extrinsic_metadata(), 1. COPY to
+-- tmp_origin_extrinsic_metadata, 2. call this function
+create or replace function swh_origin_extrinsic_metadata_add()
+ returns bigint
+ language plpgsql
+as $$
+declare
+ res bigint;
+begin
+ perform swh_origin_extrinsic_metadata_compute_tsvector();
+
+ insert into origin_extrinsic_metadata (id, metadata, indexer_configuration_id, from_remd_id, metadata_tsvector, mappings)
+ select id, metadata, indexer_configuration_id, from_remd_id,
+ metadata_tsvector, mappings
+ from tmp_origin_extrinsic_metadata
+ on conflict(id, indexer_configuration_id)
+ do update set
+ metadata = excluded.metadata,
+ metadata_tsvector = excluded.metadata_tsvector,
+ mappings = excluded.mappings,
+ from_remd_id = excluded.from_remd_id;
+
+ get diagnostics res = ROW_COUNT;
+ return res;
+end
+$$;
+
+comment on function swh_origin_extrinsic_metadata_add() IS 'Add new origin extrinsic metadata';
+
+
+-- Compute the metadata_tsvector column in tmp_origin_extrinsic_metadata.
+--
+-- It uses the "pg_catalog.simple" dictionary, as it has no stopword,
+-- so it should be suitable for proper names and non-English text.
+create or replace function swh_origin_extrinsic_metadata_compute_tsvector()
+ returns void
+ language plpgsql
+as $$
+begin
+ update tmp_origin_extrinsic_metadata
+ set metadata_tsvector = to_tsvector('pg_catalog.simple', metadata);
+end
+$$;
+
+-- origin_extrinsic_metadata
+create unique index origin_extrinsic_metadata_pkey on origin_extrinsic_metadata(id, indexer_configuration_id);
+alter table origin_extrinsic_metadata add primary key using index origin_extrinsic_metadata_pkey;
+
+alter table origin_extrinsic_metadata add constraint origin_extrinsic_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid;
+alter table origin_extrinsic_metadata validate constraint origin_extrinsic_metadata_indexer_configuration_id_fkey;
+
+create index origin_extrinsic_metadata_fulltext_idx on origin_extrinsic_metadata using gin (metadata_tsvector);
+create index origin_extrinsic_metadata_mappings_idx on origin_extrinsic_metadata using gin (mappings);
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -31,6 +31,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -122,7 +123,7 @@
class IndexerStorage:
"""SWH Indexer Storage Datastore"""
- current_version = 134
+ current_version = 135
def __init__(self, db, min_pool_conns=1, max_pool_conns=10, journal_writer=None):
"""
@@ -706,6 +707,52 @@
"per_mapping": results,
}
+ @timed
+ @db_transaction()
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str], db=None, cur=None
+ ) -> List[OriginExtrinsicMetadataRow]:
+ return [
+ OriginExtrinsicMetadataRow.from_dict(
+ converters.db_to_metadata(
+ dict(zip(db.origin_extrinsic_metadata_cols, c))
+ )
+ )
+ for c in db.origin_extrinsic_metadata_get_from_list(urls, cur)
+ ]
+
+ @timed
+ @process_metrics
+ @db_transaction()
+ def origin_extrinsic_metadata_add(
+ self,
+ metadata: List[OriginExtrinsicMetadataRow],
+ db=None,
+ cur=None,
+ ) -> Dict[str, int]:
+ check_id_duplicates(metadata)
+ metadata.sort(key=lambda m: m.id)
+ self.journal_writer.write_additions("origin_extrinsic_metadata", metadata)
+
+ db.mktemp_origin_extrinsic_metadata(cur)
+
+ db.copy_to(
+ [m.to_dict() for m in metadata],
+ "tmp_origin_extrinsic_metadata",
+ [
+ "id",
+ "metadata",
+ "indexer_configuration_id",
+ "from_remd_id",
+ "mappings",
+ ],
+ cur,
+ )
+ count = db.origin_extrinsic_metadata_add_from_temp(cur)
+ return {
+ "origin_extrinsic_metadata:add": count,
+ }
+
@timed
@db_transaction()
def indexer_configuration_add(self, tools, db=None, cur=None):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -486,6 +486,35 @@
cur.execute(" ".join(query_parts), args)
yield from cur
+ origin_extrinsic_metadata_cols = [
+ "id",
+ "metadata",
+ "from_remd_id",
+ "mappings",
+ "tool_id",
+ "tool_name",
+ "tool_version",
+ "tool_configuration",
+ ]
+
+ @stored_procedure("swh_mktemp_origin_extrinsic_metadata")
+ def mktemp_origin_extrinsic_metadata(self, cur=None):
+ pass
+
+ def origin_extrinsic_metadata_add_from_temp(self, cur=None):
+ cur = self._cursor(cur)
+ cur.execute("select * from swh_origin_extrinsic_metadata_add()")
+ return cur.fetchone()[0]
+
+ def origin_extrinsic_metadata_get_from_list(self, ids, cur=None):
+ yield from self._get_from_list(
+ "origin_extrinsic_metadata",
+ ids,
+ self.origin_extrinsic_metadata_cols,
+ cur=cur,
+ id_col="id",
+ )
+
indexer_configuration_cols = [
"id",
"tool_name",
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -39,6 +39,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from .writer import JournalWriter
@@ -254,6 +255,7 @@
DirectoryIntrinsicMetadataRow, *args
)
self._origin_intrinsic_metadata = SubStorage(OriginIntrinsicMetadataRow, *args)
+ self._origin_extrinsic_metadata = SubStorage(OriginExtrinsicMetadataRow, *args)
def check_config(self, *, check_write):
return True
@@ -483,6 +485,17 @@
mapping_count[mapping] += 1
return {"per_mapping": mapping_count, "total": total, "non_empty": non_empty}
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str]
+ ) -> List[OriginExtrinsicMetadataRow]:
+ return self._origin_extrinsic_metadata.get(urls)
+
+ def origin_extrinsic_metadata_add(
+ self, metadata: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ added = self._origin_extrinsic_metadata.add(metadata)
+ return {"origin_extrinsic_metadata:add": added}
+
def indexer_configuration_add(self, tools):
inserted = []
for tool in tools:
diff --git a/swh/indexer/storage/interface.py b/swh/indexer/storage/interface.py
--- a/swh/indexer/storage/interface.py
+++ b/swh/indexer/storage/interface.py
@@ -16,6 +16,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
@@ -479,6 +480,34 @@
"""
...
+ @remote_api_endpoint("origin_extrinsic_metadata")
+ def origin_extrinsic_metadata_get(
+ self, urls: Iterable[str]
+ ) -> List[OriginExtrinsicMetadataRow]:
+ """Retrieve origin metadata per id.
+
+ Args:
+ urls (iterable): origin URLs
+
+ Returns: list of OriginExtrinsicMetadataRow
+ """
+ ...
+
+ @remote_api_endpoint("origin_extrinsic_metadata/add")
+ def origin_extrinsic_metadata_add(
+ self, metadata: List[OriginExtrinsicMetadataRow]
+ ) -> Dict[str, int]:
+ """Add origin metadata not present in storage.
+
+ Args:
+ metadata: list of OriginExtrinsicMetadataRow objects
+
+ Returns:
+ Dict summary of number of rows added
+
+ """
+ ...
+
@remote_api_endpoint("indexer_configuration/add")
def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
diff --git a/swh/indexer/storage/model.py b/swh/indexer/storage/model.py
--- a/swh/indexer/storage/model.py
+++ b/swh/indexer/storage/model.py
@@ -136,3 +136,15 @@
metadata = attr.ib(type=Dict[str, Any])
from_directory = attr.ib(type=Sha1Git)
mappings = attr.ib(type=List[str])
+
+
+@attr.s
+class OriginExtrinsicMetadataRow(BaseRow):
+ object_type: Final = "origin_extrinsic_metadata"
+
+ id = attr.ib(type=str)
+ """origin URL"""
+ metadata = attr.ib(type=Dict[str, Any])
+ from_remd_id = attr.ib(type=Sha1Git)
+ """id of the RawExtrinsicMetadata object used as source for indexed metadata"""
+ mappings = attr.ib(type=List[str])
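The PostgreSQL backend serializes these rows with to_dict() before copying them into the temporary table and rebuilds them with from_dict() when reading results back. A small sketch of that round trip, with made-up values:

from swh.indexer.storage.model import OriginExtrinsicMetadataRow

row = OriginExtrinsicMetadataRow(
    id="https://example.org/user/project",
    metadata={"name": "project"},
    indexer_configuration_id=1,          # assumed existing tool id
    from_remd_id=b"\x02" * 20,
    mappings=["github"],
)

d = row.to_dict()
assert OriginExtrinsicMetadataRow.from_dict(d) == row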
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -20,6 +20,7 @@
ContentMetadataRow,
ContentMimetypeRow,
DirectoryIntrinsicMetadataRow,
+ OriginExtrinsicMetadataRow,
OriginIntrinsicMetadataRow,
)
from swh.model.hashutil import hash_to_bytes
@@ -1712,6 +1713,253 @@
}
+class TestIndexerStorageOriginExtrinsicMetadata:
+ def test_origin_extrinsic_metadata_add(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata = {
+ "version": None,
+ "name": None,
+ }
+ metadata_origin = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ indexer_configuration_id=tool_id,
+ mappings=["mapping1"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # when
+ storage.origin_extrinsic_metadata_add([metadata_origin])
+
+ # then
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1, "no://where"])
+ )
+
+ expected_metadata = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=["mapping1"],
+ )
+ ]
+
+ assert actual_metadata == expected_metadata
+
+ journal_objects = storage.journal_writer.journal.objects # type: ignore
+ actual_journal_metadata = [
+ obj
+ for (obj_type, obj) in journal_objects
+ if obj_type == "origin_extrinsic_metadata"
+ ]
+ assert list(sorted(actual_journal_metadata)) == list(sorted(expected_metadata))
+
+ def test_origin_extrinsic_metadata_add_update_in_place_duplicate(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata_v1: Dict[str, Any] = {
+ "version": None,
+ "name": None,
+ }
+ metadata_origin_v1 = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v1.copy(),
+ indexer_configuration_id=tool_id,
+ mappings=[],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # given
+ storage.origin_extrinsic_metadata_add([metadata_origin_v1])
+
+ # when
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1])
+ )
+
+ # then
+ expected_metadata_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v1,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=[],
+ )
+ ]
+ assert actual_metadata == expected_metadata_v1
+
+ # given
+ metadata_v2 = metadata_v1.copy()
+ metadata_v2.update(
+ {
+ "name": "test_update_duplicated_metadata",
+ "author": "MG",
+ }
+ )
+ metadata_origin_v2 = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v2.copy(),
+ indexer_configuration_id=tool_id,
+ mappings=["github"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ storage.origin_extrinsic_metadata_add([metadata_origin_v2])
+
+ actual_metadata = list(
+ storage.origin_extrinsic_metadata_get([data.origin_url_1])
+ )
+
+ expected_metadata_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata_v2,
+ tool=data.tools["swh-metadata-detector"],
+ from_remd_id=b"\x02" * 20,
+ mappings=["github"],
+ )
+ ]
+
+ # metadata did change as the v2 was used to overwrite v1
+ assert actual_metadata == expected_metadata_v2
+
+ def test_origin_extrinsic_metadata_add__deadlock(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ origins = ["file:///tmp/origin{:02d}".format(i) for i in range(100)]
+
+ example_data1: Dict[str, Any] = {
+ "metadata": {
+ "version": None,
+ "name": None,
+ },
+ "mappings": [],
+ }
+ example_data2: Dict[str, Any] = {
+ "metadata": {
+ "version": "v1.1.1",
+ "name": "foo",
+ },
+ "mappings": [],
+ }
+
+ data_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ indexer_configuration_id=tool_id,
+ **example_data1,
+ )
+ for origin in origins
+ ]
+ data_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ indexer_configuration_id=tool_id,
+ **example_data2,
+ )
+ for origin in origins
+ ]
+
+ # Remove one item from each, so that both queries have to succeed for
+ # all items to be in the DB.
+ data_v2a = data_v2[1:]
+ data_v2b = list(reversed(data_v2[0:-1]))
+
+ # given
+ storage.origin_extrinsic_metadata_add(data_v1)
+
+ # when
+ actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+ expected_data_v1 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ tool=data.tools["swh-metadata-detector"],
+ **example_data1,
+ )
+ for origin in origins
+ ]
+
+ # then
+ assert actual_data == expected_data_v1
+
+ # given
+ def f1() -> None:
+ storage.origin_extrinsic_metadata_add(data_v2a)
+
+ def f2() -> None:
+ storage.origin_extrinsic_metadata_add(data_v2b)
+
+ t1 = threading.Thread(target=f1)
+ t2 = threading.Thread(target=f2)
+ t2.start()
+ t1.start()
+
+ t1.join()
+ t2.join()
+
+ actual_data = list(storage.origin_extrinsic_metadata_get(origins))
+
+ expected_data_v2 = [
+ OriginExtrinsicMetadataRow(
+ id=origin,
+ from_remd_id=b"\x02" * 20,
+ tool=data.tools["swh-metadata-detector"],
+ **example_data2,
+ )
+ for origin in origins
+ ]
+
+ actual_data.sort(key=lambda item: item.id)
+ assert len(actual_data) == len(expected_data_v1) == len(expected_data_v2)
+ for (item, expected_item_v1, expected_item_v2) in zip(
+ actual_data, expected_data_v1, expected_data_v2
+ ):
+ assert item in (expected_item_v1, expected_item_v2)
+
+ def test_origin_extrinsic_metadata_add__duplicate_twice(
+ self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]
+ ) -> None:
+ storage, data = swh_indexer_storage_with_data
+ # given
+ tool_id = data.tools["swh-metadata-detector"]["id"]
+
+ metadata = {
+ "developmentStatus": None,
+ "name": None,
+ }
+ metadata_origin = OriginExtrinsicMetadataRow(
+ id=data.origin_url_1,
+ metadata=metadata,
+ indexer_configuration_id=tool_id,
+ mappings=["mapping1"],
+ from_remd_id=b"\x02" * 20,
+ )
+
+ # when
+ with pytest.raises(DuplicateId):
+ storage.origin_extrinsic_metadata_add([metadata_origin, metadata_origin])
+
+
class TestIndexerStorageIndexerConfiguration:
def test_indexer_configuration_add(
self, swh_indexer_storage_with_data: Tuple[IndexerStorageInterface, Any]