diff --git a/sql/upgrades/173.sql b/sql/upgrades/173.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/173.sql @@ -0,0 +1,14 @@ +-- SWH DB schema upgrade +-- from_version: 172 +-- to_version: 173 +-- description: remove unicity on (extid_type, extid) and (target_type, target) + +insert into dbversion(version, release, description) + values(173, now(), 'Work In Progress'); + +-- At the time this migratinon runs, the table is empty, +-- so no need to bother about performance +drop index extid_extid_type_extid_idx; +drop index extid_target_type_target_idx; +create unique index concurrently on extid(extid_type, extid, target_type, target); +create index concurrently on extid(target_type, target); diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1360,63 +1360,46 @@ target=extid.target.object_id, ) (token, insertion_finalizer) = self._cql_runner.extid_add_prepare(extidrow) - if ( - self.extid_get_from_extid(extid.extid_type, [extid.extid])[0] - or self.extid_get_from_target( - extid.target.object_type, [extid.target.object_id] - )[0] - ): - # on conflict do nothing... - continue self._cql_runner.extid_index_add_one(extidrow, token) insertion_finalizer() inserted += 1 return {"extid:add": inserted} - def extid_get_from_extid( - self, id_type: str, ids: List[bytes] - ) -> List[Optional[ExtID]]: - result: List[Optional[ExtID]] = [] + def extid_get_from_extid(self, id_type: str, ids: List[bytes]) -> List[ExtID]: + result: List[ExtID] = [] for extid in ids: extidrows = list(self._cql_runner.extid_get_from_extid(id_type, extid)) - assert len(extidrows) <= 1 - if extidrows: - result.append( - ExtID( - extid_type=extidrows[0].extid_type, - extid=extidrows[0].extid, - target=CoreSWHID( - object_type=extidrows[0].target_type, - object_id=extidrows[0].target, - ), - ) + result.extend( + ExtID( + extid_type=extidrow.extid_type, + extid=extidrow.extid, + target=CoreSWHID( + object_type=extidrow.target_type, object_id=extidrow.target, + ), ) - else: - result.append(None) + for extidrow in extidrows + ) return result def extid_get_from_target( self, target_type: SwhidObjectType, ids: List[Sha1Git] - ) -> List[Optional[ExtID]]: - result: List[Optional[ExtID]] = [] + ) -> List[ExtID]: + result: List[ExtID] = [] for target in ids: extidrows = list( self._cql_runner.extid_get_from_target(target_type.value, target) ) - assert len(extidrows) <= 1 - if extidrows: - result.append( - ExtID( - extid_type=extidrows[0].extid_type, - extid=extidrows[0].extid, - target=CoreSWHID( - object_type=SwhidObjectType(extidrows[0].target_type), - object_id=extidrows[0].target, - ), - ) + result.extend( + ExtID( + extid_type=extidrow.extid_type, + extid=extidrow.extid, + target=CoreSWHID( + object_type=SwhidObjectType(extidrow.target_type), + object_id=extidrow.target, + ), ) - else: - result.append(None) + for extidrow in extidrows + ) return result # Misc diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -502,9 +502,7 @@ ... @remote_api_endpoint("extid/from_extid") - def extid_get_from_extid( - self, id_type: str, ids: List[bytes] - ) -> List[Optional[ExtID]]: + def extid_get_from_extid(self, id_type: str, ids: List[bytes]) -> List[ExtID]: """Get ExtID objects from external IDs Args: @@ -512,7 +510,7 @@ ids: list of external IDs Returns: - list of ExtID objects (if the ext ID is known, None otherwise) + list of ExtID objects """ ... @@ -520,7 +518,7 @@ @remote_api_endpoint("extid/from_target") def extid_get_from_target( self, target_type: ObjectType, ids: List[Sha1Git] - ) -> List[Optional[ExtID]]: + ) -> List[ExtID]: """Get ExtID objects from target IDs and target_type Args: @@ -528,7 +526,7 @@ ids: list of target IDs Returns: - list of ExtID objects (if the SWH ID is known, None otherwise) + list of ExtID objects """ ... diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -29,7 +29,7 @@ """ - current_version = 172 + current_version = 173 def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute( diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -643,28 +643,22 @@ @db_transaction() def extid_get_from_extid( self, id_type: str, ids: List[bytes], db=None, cur=None - ) -> List[Optional[ExtID]]: + ) -> List[ExtID]: extids = [] for row in db.extid_get_from_extid_list(id_type, ids, cur): - extids.append( - converters.db_to_extid(dict(zip(db.extid_cols, row))) - if row[0] is not None - else None - ) + if row[0] is not None: + extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids @timed @db_transaction() def extid_get_from_target( self, target_type: ObjectType, ids: List[Sha1Git], db=None, cur=None - ) -> List[Optional[ExtID]]: + ) -> List[ExtID]: extids = [] for row in db.extid_get_from_swhid_list(target_type.value, ids, cur): - extids.append( - converters.db_to_extid(dict(zip(db.extid_cols, row))) - if row[0] is not None - else None - ) + if row[0] is not None: + extids.append(converters.db_to_extid(dict(zip(db.extid_cols, row)))) return extids @timed diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(172, now(), 'Work In Progress'); + values(173, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql --- a/swh/storage/sql/60-indexes.sql +++ b/swh/storage/sql/60-indexes.sql @@ -287,5 +287,7 @@ alter table object_counts_bucketed add primary key using index object_counts_bucketed_pkey; -- extid -create unique index concurrently on extid(extid_type, extid); -create unique index concurrently on extid(target_type, target); + +-- used to query by (extid_type, extid) + to deduplicate the whole row +create unique index concurrently on extid(extid_type, extid, target_type, target); +create index concurrently on extid(target_type, target); diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1090,7 +1090,6 @@ for revision in sample_data.revisions if revision.type.value == "git" ] - nullids = [None] * len(gitids) extids = [ ExtID( extid=gitid, @@ -1100,8 +1099,8 @@ for gitid in gitids ] - assert swh_storage.extid_get_from_extid("git", gitids) == nullids - assert swh_storage.extid_get_from_target(ObjectType.REVISION, gitids) == nullids + assert swh_storage.extid_get_from_extid("git", gitids) == [] + assert swh_storage.extid_get_from_target(ObjectType.REVISION, gitids) == [] summary = swh_storage.extid_add(extids) assert summary == {"extid:add": len(gitids)} @@ -1109,8 +1108,8 @@ assert swh_storage.extid_get_from_extid("git", gitids) == extids assert swh_storage.extid_get_from_target(ObjectType.REVISION, gitids) == extids - assert swh_storage.extid_get_from_extid("hg", gitids) == nullids - assert swh_storage.extid_get_from_target(ObjectType.RELEASE, gitids) == nullids + assert swh_storage.extid_get_from_extid("hg", gitids) == [] + assert swh_storage.extid_get_from_target(ObjectType.RELEASE, gitids) == [] def test_extid_add_hg(self, swh_storage, sample_data): def get_node(revision): @@ -1131,10 +1130,9 @@ for revision in sample_data.revisions if revision.type.value == "hg" ] - nullids = [None] * len(swhids) - assert swh_storage.extid_get_from_extid("hg", extids) == nullids - assert swh_storage.extid_get_from_target(ObjectType.REVISION, swhids) == nullids + assert swh_storage.extid_get_from_extid("hg", extids) == [] + assert swh_storage.extid_get_from_target(ObjectType.REVISION, swhids) == [] extid_objs = [ ExtID( @@ -1152,8 +1150,8 @@ swh_storage.extid_get_from_target(ObjectType.REVISION, swhids) == extid_objs ) - assert swh_storage.extid_get_from_extid("git", extids) == nullids - assert swh_storage.extid_get_from_target(ObjectType.RELEASE, swhids) == nullids + assert swh_storage.extid_get_from_extid("git", extids) == [] + assert swh_storage.extid_get_from_target(ObjectType.RELEASE, swhids) == [] def test_extid_add_twice(self, swh_storage, sample_data): @@ -1180,14 +1178,13 @@ assert swh_storage.extid_get_from_extid("git", gitids) == extids assert swh_storage.extid_get_from_target(ObjectType.REVISION, gitids) == extids - def test_extid_add_extid_unicity(self, swh_storage, sample_data): + def test_extid_add_extid_multicity(self, swh_storage, sample_data): ids = [ revision.id for revision in sample_data.revisions if revision.type.value == "git" ] - nullids = [None] * len(ids) extids = [ ExtID( @@ -1199,7 +1196,7 @@ ] swh_storage.extid_add(extids) - # try to add "modified-extid" versions, should be noops + # try to add "modified-extid" versions, should be added extids2 = [ ExtID( extid=extid, @@ -1211,42 +1208,44 @@ swh_storage.extid_add(extids2) assert swh_storage.extid_get_from_extid("git", ids) == extids - assert swh_storage.extid_get_from_extid("hg", ids) == nullids - assert swh_storage.extid_get_from_target(ObjectType.REVISION, ids) == extids + assert swh_storage.extid_get_from_extid("hg", ids) == extids2 + assert set(swh_storage.extid_get_from_target(ObjectType.REVISION, ids)) == { + *extids, + *extids2, + } - def test_extid_add_target_unicity(self, swh_storage, sample_data): + def test_extid_add_target_multicity(self, swh_storage, sample_data): ids = [ revision.id for revision in sample_data.revisions if revision.type.value == "git" ] - nullids = [None] * len(ids) extids = [ ExtID( extid=extid, extid_type="git", - target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), + target=CoreSWHID(object_id=extid, object_type=ObjectType.RELEASE,), ) for extid in ids ] swh_storage.extid_add(extids) - # try to add "modified" versions, should be noops + # try to add "modified" versions, should be added extids2 = [ ExtID( extid=extid, extid_type="git", - target=CoreSWHID(object_id=extid, object_type=ObjectType.RELEASE,), + target=CoreSWHID(object_id=extid, object_type=ObjectType.REVISION,), ) for extid in ids ] swh_storage.extid_add(extids2) - assert swh_storage.extid_get_from_extid("git", ids) == extids - assert swh_storage.extid_get_from_target(ObjectType.REVISION, ids) == extids - assert swh_storage.extid_get_from_target(ObjectType.RELEASE, ids) == nullids + assert set(swh_storage.extid_get_from_extid("git", ids)) == {*extids, *extids2} + assert swh_storage.extid_get_from_target(ObjectType.REVISION, ids) == extids2 + assert swh_storage.extid_get_from_target(ObjectType.RELEASE, ids) == extids def test_release_add(self, swh_storage, sample_data): release, release2 = sample_data.releases[:2]