diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py --- a/swh/storage/backfill.py +++ b/swh/storage/backfill.py @@ -80,7 +80,15 @@ "reason", ], "directory": ["id", "dir_entries", "file_entries", "rev_entries", "raw_manifest"], - "extid": ["extid_type", "extid", "extid_version", "target_type", "target"], + "extid": [ + "extid_type", + "extid", + "extid_version", + "target_type", + "target", + "payload_type", + "payload", + ], "metadata_authority": ["type", "url"], "metadata_fetcher": ["name", "version"], "origin": ["url"], diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -1352,6 +1352,8 @@ extid: bytes, extid_version: int, target: CoreSWHID, + payload_type: str, + payload: Sha1Git, *, statement, ) -> Optional[ExtIDRow]: @@ -1364,6 +1366,8 @@ extid_version, target.object_type.value, target.object_id, + payload_type, + payload, ], ), ) diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -317,13 +317,23 @@ @dataclasses.dataclass class ExtIDRow(BaseRow): TABLE = "extid" - PARTITION_KEY = ("target", "target_type", "extid_version", "extid", "extid_type") + PARTITION_KEY = ( + "target", + "target_type", + "extid_version", + "extid", + "extid_type", + "payload_type", + "payload", + ) extid_type: str extid: bytes extid_version: int target_type: str target: bytes + payload_type: Optional[str] + payload: Optional[bytes] @dataclasses.dataclass diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -278,7 +278,9 @@ extid_version smallint, target_type ascii, target blob, - PRIMARY KEY ((extid_type, extid), extid_version, target_type, target) + payload_type ascii, + payload blob, + PRIMARY KEY ((extid_type, extid), extid_version, target_type, target, payload_type, payload) );""", """ CREATE TABLE IF NOT EXISTS extid_by_target ( diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1663,6 +1663,8 @@ extid_version=extid.extid_version, extid=extid.extid, target=extid.target, + payload_type=extid.payload_type, + payload=extid.payload, ) ] else: @@ -1676,12 +1678,16 @@ target = extid.target.object_id extid_version = extid.extid_version extid_type = extid.extid_type + payload_type = extid.payload_type + payload = extid.payload extidrow = ExtIDRow( extid_type=extid_type, extid_version=extid_version, extid=extid.extid, target_type=target_type, target=target, + payload_type=payload_type, + payload=payload, ) (token, insertion_finalizer) = self._cql_runner.extid_add_prepare(extidrow) indexrow = ExtIDByTargetRow( @@ -1714,6 +1720,8 @@ object_type=extidrow.target_type, object_id=extidrow.target, ), + payload_type=extidrow.payload_type, + payload=extidrow.payload, ) for extidrow in extidrows ) @@ -1749,6 +1757,8 @@ object_type=SwhidObjectType(extidrow.target_type), object_id=extidrow.target, ), + payload_type=extidrow.payload_type, + payload=extidrow.payload, ) for extidrow in extidrows ) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -740,6 +740,8 @@ extid: bytes, extid_version: int, target: ExtendedSWHID, + payload_type: str, + payload: Sha1Git, ) -> Optional[ExtIDRow]: primary_key = self._extid.primary_key_from_dict( dict( @@ -748,6 +750,8 @@ extid_version=extid_version, target_type=target.object_type.value, target=target.object_id, + payload_type=payload_type, + payload=payload, ) ) return self._extid.get_from_primary_key(primary_key) diff --git a/swh/storage/postgresql/converters.py b/swh/storage/postgresql/converters.py --- a/swh/storage/postgresql/converters.py +++ b/swh/storage/postgresql/converters.py @@ -359,4 +359,6 @@ object_id=row["target"], object_type=SwhidObjectType[row["target_type"].upper()], ), + payload_type=row["payload_type"], + payload=row["payload"], ) diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -899,7 +899,15 @@ ((sortkey, id) for sortkey, id in enumerate(revisions)), ) - extid_cols = ["extid", "extid_version", "extid_type", "target", "target_type"] + extid_cols = [ + "extid", + "extid_version", + "extid_type", + "target", + "target_type", + "payload_type", + "payload", + ] def extid_get_from_extid_list( self, extid_type: str, ids: List[bytes], version: Optional[int] = None, cur=None diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -108,7 +108,7 @@ class Storage: """SWH storage datastore proxy, encompassing DB and object storage""" - current_version: int = 186 + current_version: int = 187 def __init__( self, @@ -807,6 +807,8 @@ "extid_version": getattr(extid, "extid_version", 0), "target": extid.target.object_id, "target_type": extid.target.object_type.name.lower(), # arghh + "payload_type": extid.payload_type, + "payload": extid.payload, } for extid in ids ] diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -506,7 +506,9 @@ extid bytea not null, target_type object_type not null, target sha1_git not null, - extid_version bigint not null default 0 + extid_version bigint not null default 0, + payload_type text, + payload sha1_git ); comment on table extid is 'Correspondance SWH object (SWHID) <-> original revision id (vcs id)'; @@ -515,3 +517,5 @@ comment on column extid.target_type is 'Type of SWHID of the referenced SWH object'; comment on column extid.target is 'Value (hash) of SWHID of the refenced SWH object'; comment on column extid.extid_version is 'Version of the extid type for the given original object'; +comment on column extid.payload_type is 'The type of the payload object'; +comment on column extid.payload is 'Content object ID of data associated with the ExtID'; diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -598,8 +598,8 @@ language plpgsql as $$ begin - insert into extid (extid_type, extid, extid_version, target_type, target) - select distinct t.extid_type, t.extid, t.extid_version, t.target_type, t.target + insert into extid (extid_type, extid, extid_version, target_type, target, payload_type, payload) + select distinct t.extid_type, t.extid, t.extid_version, t.target_type, t.target, t.payload_type, t.payload from tmp_extid t on conflict do nothing; return; diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql --- a/swh/storage/sql/60-indexes.sql +++ b/swh/storage/sql/60-indexes.sql @@ -318,6 +318,14 @@ -- extid +-- the payload can be null if and only if the payload_type is null +alter table extid + add constraint extid_payload_check + check ((payload_type is null) = (payload is null)) + not valid; + -- used to query by (extid_type, extid) + to deduplicate the whole row -create unique index concurrently on extid(extid_type, extid, extid_version, target_type, target); +create unique index concurrently on extid(extid_type, extid, extid_version, target_type, target) where payload_type is null and payload is null; +create unique index concurrently on extid(extid_type, extid, extid_version, target_type, target, payload_type, payload) where payload_type is not null and payload is not null; + create index concurrently on extid(target_type, target); diff --git a/swh/storage/sql/upgrades/187.sql b/swh/storage/sql/upgrades/187.sql new file mode 100644 --- /dev/null +++ b/swh/storage/sql/upgrades/187.sql @@ -0,0 +1,51 @@ +-- SWH DB schema upgrade +-- from_version: 186 +-- to_version: 187 +-- description: Add payloads to ExtIDs + +insert into dbversion(version, release, description) + values(187, now(), 'Work In Progress'); + +-- Add payload_type and payload columns. + +alter table extid + add column payload_type text; +comment on column extid.payload_type is 'The type of the payload object'; + +alter table extid + add column payload sha1_git; +comment on column extid.payload is 'Content object ID of data associated with the ExtID'; + +-- Add payload constraint. + +alter table extid + add constraint extid_payload_check + check ((payload_type is null) = (payload is null)) + not valid; + +-- Update the unique index. + +create unique index concurrently extid_no_payload_idx + on extid(extid_type, extid, extid_version, target_type, target) + where payload_type is null and payload is null; + +create unique index concurrently extid_payload_idx + on extid(extid_type, extid, extid_version, target_type, target, payload_type, payload) + where payload_type is not null and payload is not null; + +drop index if exists extid_extid_type_extid_extid_version_target_type_target_idx; + +-- Update the swh_extid_add procedure to include the new columns. + +create or replace function swh_extid_add() + returns void + language plpgsql +as $$ +begin + insert into extid (extid_type, extid, extid_version, target_type, target, payload_type, payload) + select distinct t.extid_type, t.extid, t.extid_version, t.target_type, t.target, t.payload_type, t.payload + from tmp_extid t + on conflict do nothing; + return; +end +$$; diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -831,9 +831,20 @@ extid_version=2, ) + extid5 = ExtID( + target=CoreSWHID( + object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id + ), + extid_type="directory", + extid=b"something", + payload_type="test", + payload=content.sha1_git, + ) + extids: Tuple[ExtID, ...] = ( extid1, extid2, extid3, extid4, + extid5, ) diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1695,6 +1695,41 @@ ObjectType.REVISION, [ids[0]], extid_type="git" ) + def test_extid_payload(self, swh_storage, sample_data): + target = sample_data.directory2.swhid() + extids = [ + ExtID( + extid=b"abc123", + extid_type="test", + target=target, + payload_type="test_payload", + payload=sample_data.content.sha1_git, + ), + ] + + assert swh_storage.extid_get_from_extid("test", [b"abc123"]) == [] + assert ( + swh_storage.extid_get_from_target(target.object_type, [target.object_id]) + == [] + ) + + summary = swh_storage.extid_add(extids) + assert summary == {"extid:add": 1} + + assert swh_storage.extid_get_from_extid("test", [b"abc123"]) == extids + assert ( + swh_storage.extid_get_from_target(target.object_type, [target.object_id]) + == extids + ) + + # check ExtIDs have been added to the journal + extids_in_journal = [ + obj + for (obj_type, obj) in swh_storage.journal_writer.journal.objects + if obj_type == "extid" + ] + assert extids == extids_in_journal + def test_release_add(self, swh_storage, sample_data): release, release2 = sample_data.releases[:2]