diff --git a/sql/upgrades/176.sql b/sql/upgrades/176.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/176.sql @@ -0,0 +1,27 @@ +-- SWH DB schema upgrade +-- from_version: 175 +-- to_version: 176 +-- description: add storage of the extid.extid_version field + +insert into dbversion(version, release, description) + values(176, now(), 'Work In Progress'); + +alter table extid add column extid_version bigint not null default 0; + +comment on column extid.extid_version is 'Version of the extid for the given original object'; + +create or replace function swh_extid_add() + returns void + language plpgsql +as $$ +begin + insert into extid (extid_type, extid, extid_version, target_type, target) + select distinct t.extid_type, t.extid, t.extid_version, t.target_type, t.target + from tmp_extid t + on conflict do nothing; + return; +end +$$; + +create unique index concurrently on extid(extid_type, extid, extid_version, target_type, target); +drop index extid_extid_type_extid_target_type_target_idx; diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py --- a/swh/storage/backfill.py +++ b/swh/storage/backfill.py @@ -80,7 +80,7 @@ "reason", ], "directory": ["id", "dir_entries", "file_entries", "rev_entries"], - "extid": ["extid_type", "extid", "target_type", "target"], + "extid": ["extid_type", "extid", "extid_version", "target_type", "target"], "metadata_authority": ["type", "url"], "metadata_fetcher": ["name", "version"], "origin": ["url"], diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -1178,15 +1178,29 @@ return (token, finalizer) @_prepared_select_statement( - ExtIDRow, "WHERE extid_type=? AND extid=? AND target_type=? AND target=?", + ExtIDRow, + "WHERE extid_type=? AND extid=? AND extid_version=? " + "AND target_type=? AND target=?", ) def extid_get_from_pk( - self, extid_type: str, extid: bytes, target: CoreSWHID, *, statement, + self, + extid_type: str, + extid: bytes, + extid_version: int, + target: CoreSWHID, + *, + statement, ) -> Optional[ExtIDRow]: rows = list( self._execute_with_retries( statement, - [extid_type, extid, target.object_type.value, target.object_id], + [ + extid_type, + extid, + extid_version, + target.object_type.value, + target.object_id, + ], ), ) assert len(rows) <= 1 diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -314,10 +314,11 @@ @dataclasses.dataclass class ExtIDRow(BaseRow): TABLE = "extid" - PARTITION_KEY = ("target", "target_type", "extid", "extid_type") + PARTITION_KEY = ("target", "target_type", "extid_version", "extid", "extid_type") extid_type: str extid: bytes + extid_version: int target_type: str target: bytes diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -277,9 +277,10 @@ CREATE TABLE IF NOT EXISTS extid ( extid_type ascii, extid blob, + extid_version smallint, target_type ascii, target blob, - PRIMARY KEY ((extid_type, extid), target_type, target) + PRIMARY KEY ((extid_type, extid), extid_version, target_type, target) );""", """ CREATE TABLE IF NOT EXISTS extid_by_target ( diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1455,7 +1455,10 @@ extid for extid in ids if not self._cql_runner.extid_get_from_pk( - extid_type=extid.extid_type, extid=extid.extid, target=extid.target, + extid_type=extid.extid_type, + extid_version=extid.extid_version, + extid=extid.extid, + target=extid.target, ) ] else: @@ -1469,6 +1472,7 @@ target = extid.target.object_id extidrow = ExtIDRow( extid_type=extid.extid_type, + extid_version=extid.extid_version, extid=extid.extid, target_type=target_type, target=target, @@ -1489,6 +1493,7 @@ result.extend( ExtID( extid_type=extidrow.extid_type, + extid_version=extidrow.extid_version, extid=extidrow.extid, target=CoreSWHID( object_type=extidrow.target_type, object_id=extidrow.target, @@ -1509,6 +1514,7 @@ result.extend( ExtID( extid_type=extidrow.extid_type, + extid_version=extidrow.extid_version, extid=extidrow.extid, target=CoreSWHID( object_type=SwhidObjectType(extidrow.target_type), diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -681,12 +681,13 @@ pass def extid_get_from_pk( - self, extid_type: str, extid: bytes, target: ExtendedSWHID, + self, extid_type: str, extid: bytes, extid_version: int, target: ExtendedSWHID, ) -> Optional[ExtIDRow]: primary_key = self._extid.primary_key_from_dict( dict( extid_type=extid_type, extid=extid, + extid_version=extid_version, target_type=target.object_type.value, target=target.object_id, ) diff --git a/swh/storage/postgresql/converters.py b/swh/storage/postgresql/converters.py --- a/swh/storage/postgresql/converters.py +++ b/swh/storage/postgresql/converters.py @@ -331,6 +331,7 @@ return ExtID( extid=row["extid"], extid_type=row["extid_type"], + extid_version=row.get("extid_version", 0), target=CoreSWHID( object_id=row["target"], object_type=SwhidObjectType[row["target_type"].upper()], diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -30,7 +30,7 @@ """ - current_version = 175 + current_version = 176 def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute( @@ -843,7 +843,7 @@ ((sortkey, id) for sortkey, id in enumerate(revisions)), ) - extid_cols = ["extid", "extid_type", "target", "target_type"] + extid_cols = ["extid", "extid_version", "extid_type", "target", "target_type"] def extid_get_from_extid_list(self, extid_type, ids, cur=None): cur = self._cursor(cur) diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -718,6 +718,7 @@ { "extid": extid.extid, "extid_type": extid.extid_type, + "extid_version": getattr(extid, "extid_version", 0), "target": extid.target.object_id, "target_type": extid.target.object_type.name.lower(), # arghh } diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(175, now(), 'Work In Progress'); + values(176, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -505,7 +505,8 @@ extid_type text not null, extid bytea not null, target_type object_type not null, - target sha1_git not null + target sha1_git not null, + extid_version bigint not null default 0 ); comment on table extid is 'Correspondance SWH object (SWHID) <-> original revision id (vcs id)'; @@ -513,3 +514,4 @@ comment on column extid.extid is 'Intrinsic identifier of the object (e.g. hg revision)'; comment on column extid.target_type is 'Type of SWHID of the referenced SWH object'; comment on column extid.target is 'Value (hash) of SWHID of the refenced SWH object'; +comment on column extid.extid_version is 'Version of the extid for the given original object'; diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -585,8 +585,8 @@ language plpgsql as $$ begin - insert into extid (extid_type, extid, target_type, target) - select distinct t.extid_type, t.extid, t.target_type, t.target + insert into extid (extid_type, extid, extid_version, target_type, target) + select distinct t.extid_type, t.extid, t.extid_version, t.target_type, t.target from tmp_extid t on conflict do nothing; return; diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql --- a/swh/storage/sql/60-indexes.sql +++ b/swh/storage/sql/60-indexes.sql @@ -289,5 +289,5 @@ -- extid -- used to query by (extid_type, extid) + to deduplicate the whole row -create unique index concurrently on extid(extid_type, extid, target_type, target); +create unique index concurrently on extid(extid_type, extid, extid_version, target_type, target); create index concurrently on extid(target_type, target); diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -698,9 +698,18 @@ extid_type="directory", extid=b"something", ) + extid4 = ExtID( + target=CoreSWHID( + object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id + ), + extid_type="directory", + extid=b"something", + extid_version=2, + ) extids: Tuple[ExtID, ...] = ( extid1, extid2, extid3, + extid4, ) diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1183,6 +1183,7 @@ ExtID( extid=hgid, extid_type="hg", + extid_version=1, target=CoreSWHID(object_id=swhid, object_type=ObjectType.REVISION,), ) for hgid, swhid in zip(extids, swhids) @@ -1243,6 +1244,7 @@ ExtID( extid=extid, extid_type="git", + extid_version=2, target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), ) for extid in ids @@ -1254,6 +1256,7 @@ ExtID( extid=extid, extid_type="hg", + extid_version=2, target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), ) for extid in ids @@ -1300,6 +1303,42 @@ assert swh_storage.extid_get_from_target(ObjectType.REVISION, ids) == extids assert swh_storage.extid_get_from_target(ObjectType.RELEASE, ids) == extids2 + def test_extid_version_behavior(self, swh_storage, sample_data): + ids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "git" + ] + + # Insert extids with several different versions + extids = [ + ExtID( + extid=extid, + extid_type="git", + target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), + ) + for extid in ids + ] + [ + ExtID( + extid=extid, + extid_type="git", + extid_version=1, + target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), + ) + for extid in ids + ] + swh_storage.extid_add(extids) + + # Check that both versions get returned + for git_id in ids: + objs = swh_storage.extid_get_from_extid("git", [git_id]) + assert len(objs) == 2 + assert set(obj.extid_version for obj in objs) == {0, 1} + for swhid in ids: + objs = swh_storage.extid_get_from_target(ObjectType.REVISION, [swhid]) + assert len(objs) == 2 + assert set(obj.extid_version for obj in objs) == {0, 1} + def test_release_add(self, swh_storage, sample_data): release, release2 = sample_data.releases[:2] diff --git a/swh/storage/tests/test_cassandra.py b/swh/storage/tests/test_cassandra.py --- a/swh/storage/tests/test_cassandra.py +++ b/swh/storage/tests/test_cassandra.py @@ -469,6 +469,7 @@ ExtIDRow( extid_type=extid.extid_type, extid=extid.extid, + extid_version=extid.extid_version, target_type=extid.target.object_type.value, target=extid.target.object_id, ) diff --git a/swh/storage/tests/test_storage_data.py b/swh/storage/tests/test_storage_data.py --- a/swh/storage/tests/test_storage_data.py +++ b/swh/storage/tests/test_storage_data.py @@ -24,6 +24,7 @@ "authorities", "origin_metadata", "content_metadata", + "extids", ]: for obj in getattr(data, attribute_key): assert isinstance(obj, BaseModel)