diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -15,10 +15,12 @@ from swh.model.model import ( Content, Directory, + ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, + ObjectType, Origin, OriginVisit, OriginVisitStatus, @@ -501,6 +503,52 @@ """ ... + @remote_api_endpoint("extid/from_extid") + def extid_get_from_extid( + self, id_type: str, ids: List[bytes] + ) -> List[Optional[ExtID]]: + """Get ExtID objects from external IDs + + Args: + id_type: type the external identifiers the SWH objects comes from + ids: list of external IDs + + Returns: + list of ExtID objects (if the ext ID is known, None otherwise) + + """ + ... + + @remote_api_endpoint("extid/from_swhid") + def extid_get_from_swhid( + self, target_type: ObjectType, ids: List[Sha1Git] + ) -> List[Optional[ExtID]]: + """Get ExtID objects from SWH IDs + + Args: + target_type: type the SWH object + ids: list of SWH IDs + + Returns: + list of ExtID objects (if the SWH ID is known, None otherwise) + + """ + ... + + @remote_api_endpoint("extid/add") + def extid_add(self, ids: List[ExtID]) -> Dict[str, int]: + """Add a series of ExtID objects + + Args: + ids: list of ExtID objects + + Returns: + Summary dict of keys with associated count as values + + extid:add: New ExtID objects actually stored in db + """ + ... + @remote_api_endpoint("revision/log") def revision_log( self, revisions: List[Sha1Git], limit: Optional[int] = None diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -76,6 +76,10 @@ def revision_add_from_temp(self, cur=None): pass + @stored_procedure("swh_extid_add") + def extid_add_from_temp(self, cur=None): + pass + @stored_procedure("swh_release_add") def release_add_from_temp(self, cur=None): pass @@ -813,6 +817,50 @@ ((sortkey, id) for sortkey, id in enumerate(revisions)), ) + extid_cols = ["extid", "extid_type", "target", "target_type"] + + def extid_get_from_extid_list(self, extid_type, ids, cur=None): + cur = self._cursor(cur) + + query_keys = ", ".join( + self.mangle_query_key(k, "extid") for k in self.extid_cols + ) + sql = """ + SELECT %s + FROM (VALUES %%s) as t(sortkey, extid, extid_type) + LEFT JOIN extid USING (extid, extid_type) + ORDER BY sortkey + """ % ( + query_keys, + ) + + yield from execute_values_generator( + cur, + sql, + (((sortkey, extid, extid_type) for sortkey, extid in enumerate(ids))), + ) + + def extid_get_from_swhid_list(self, target_type, ids, cur=None): + cur = self._cursor(cur) + + query_keys = ", ".join( + self.mangle_query_key(k, "extid") for k in self.extid_cols + ) + sql = """ + SELECT %s + FROM (VALUES %%s) as t(sortkey, target, target_type) + LEFT JOIN extid USING (target, target_type) + ORDER BY sortkey + """ % ( + query_keys, + ) + yield from execute_values_generator( + cur, + sql, + (((sortkey, target, target_type) for sortkey, target in enumerate(ids))), + template=b"(%s,%s,%s::object_type)", + ) + def revision_log(self, root_revisions, limit=None, cur=None): cur = self._cursor(cur) diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -24,10 +24,12 @@ SHA1_SIZE, Content, Directory, + ExtID, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, + ObjectType, Origin, OriginVisit, OriginVisitStatus, @@ -634,6 +636,77 @@ def revision_get_random(self, db=None, cur=None) -> Sha1Git: return db.revision_get_random(cur) + @timed + @db_transaction() + def extid_get_from_extid( + self, id_type: str, ids: List[bytes], db=None, cur=None + ) -> List[Optional[ExtID]]: + """Get ExtID objects from external IDs + + Args: + id_type: type the external identifiers the SWH objects comes from + ids: list of external IDs + + Returns: + list of ExtID objects (if the ext ID is known, None otherwise) + + """ + extids = [] + for row in db.extid_get_from_extid_list(id_type, ids, cur): + extids.append( + ExtID.from_dict(dict(zip(db.extid_cols, row))) + if row[0] is not None + else None + ) + return extids + + @timed + @db_transaction() + def extid_get_from_swhid( + self, target_type: ObjectType, ids: List[Sha1Git], db=None, cur=None + ) -> List[Optional[ExtID]]: + """Get ExtID objects from SWH IDs + + Args: + target_type: type the SWH object + ids: list of SWH IDs + + Returns: + list of ExtID objects (if the SWH ID is known, None otherwise) + + """ + extids = [] + for row in db.extid_get_from_swhid_list(target_type.value, ids, cur): + extids.append( + ExtID.from_dict(dict(zip(db.extid_cols, row))) + if row[0] is not None + else None + ) + return extids + + @timed + @db_transaction() + def extid_add(self, ids: List[ExtID], db=None, cur=None) -> Dict[str, int]: + """Add a series of ExtID objects + + Args: + ids: list of ExtID objects + + Returns: + Summary dict of keys with associated count as values + + extid:add: New ExtID objects actually stored in db + """ + extid = [extid.to_dict() for extid in ids] + db.mktemp("extid", cur) + + db.copy_to(extid, "tmp_extid", db.extid_cols, cur) + + # move metadata in place + db.extid_add_from_temp(cur) + + return {"extid:add": len(extid)} + @timed @process_metrics @db_transaction() diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -497,3 +497,19 @@ comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; + + +-- The ExtID (typ. original VCS) <-> swhid relation table +create table extid +( + extid_type text not null, + extid bytea not null, + target_type object_type not null, + target sha1_git not null +); + +comment on table extid is 'Correspondance SWH object (SWHID) <-> original revision id (vcs id)'; +comment on column extid.extid_type is 'ExtID type'; +comment on column extid.extid is 'Intrinsic identifier of the object (e.g. hg revision)'; +comment on column extid.target_type is 'Type of SWHID of the referenced SWH object'; +comment on column extid.target is 'Value (hash) of SWHID of the refenced SWH object'; diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -549,6 +549,23 @@ $$; +-- Create entries in extid from tmp_extid +-- operates in bulk: 0. swh_mktemp(extid), 1. COPY to tmp_extid, +-- 2. call this function +create or replace function swh_extid_add() + returns void + language plpgsql +as $$ +begin + insert into extid (extid_type, extid, target_type, target) + select distinct t.extid_type, t.extid, t.target_type, t.target + from tmp_extid t + on conflict do nothing; + return; +end +$$; + + -- Create entries in person from tmp_release create or replace function swh_person_add_from_release() returns void diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql --- a/swh/storage/sql/60-indexes.sql +++ b/swh/storage/sql/60-indexes.sql @@ -281,3 +281,7 @@ -- object_counts_bucketed create unique index concurrently object_counts_bucketed_pkey on object_counts_bucketed(line); alter table object_counts_bucketed add primary key using index object_counts_bucketed_pkey; + +-- extid +create unique index concurrently on extid(extid_type, extid); +create unique index concurrently on extid(target_type, target); diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -312,7 +312,139 @@ extra_headers=(), synthetic=False, ) - revisions: Tuple[Revision, ...] = (revision, revision2, revision3, revision4) + git_revisions: Tuple[Revision, ...] = (revision, revision2, revision3, revision4) + + hg_revision = Revision( + id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"), + message=b"hello", + author=Person( + name=b"Nicolas Dandrimont", + email=b"nicolas@example.com", + fullname=b"Nicolas Dandrimont ", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567890, microseconds=0), + offset=120, + negative_utc=False, + ), + committer=Person( + name=b"St\xc3fano Zacchiroli", + email=b"stefano@example.com", + fullname=b"St\xc3fano Zacchiroli ", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1123456789, microseconds=0), + offset=120, + negative_utc=False, + ), + parents=(), + type=RevisionType.MERCURIAL, + directory=directory.id, + metadata={ + "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",}, + "signed-off-by": "some-dude", + "node": "a316dfb434af2b451c1f393496b7eaeda343f543", + }, + extra_headers=(), + synthetic=True, + ) + hg_revision2 = Revision( + id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"), + message=b"hello again", + author=Person( + name=b"Roberto Dicosmo", + email=b"roberto@example.com", + fullname=b"Roberto Dicosmo ", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567843, microseconds=220000,), + offset=-720, + negative_utc=False, + ), + committer=Person( + name=b"tony", email=b"ar@dumont.fr", fullname=b"tony ", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1123456789, microseconds=220000,), + offset=0, + negative_utc=False, + ), + parents=tuple([hg_revision.id]), + type=RevisionType.MERCURIAL, + directory=directory2.id, + metadata=None, + extra_headers=( + (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), + ), + synthetic=False, + ) + hg_revision3 = Revision( + id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"), + message=b"a simple revision with no parents this time", + author=Person( + name=b"Roberto Dicosmo", + email=b"roberto@example.com", + fullname=b"Roberto Dicosmo ", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567843, microseconds=220000,), + offset=-720, + negative_utc=False, + ), + committer=Person( + name=b"tony", email=b"ar@dumont.fr", fullname=b"tony ", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1127351742, microseconds=220000,), + offset=0, + negative_utc=False, + ), + parents=tuple([hg_revision.id, hg_revision2.id]), + type=RevisionType.MERCURIAL, + directory=directory2.id, + metadata=None, + extra_headers=( + (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), + ), + synthetic=True, + ) + hg_revision4 = Revision( + id=hash_to_bytes("42070a39e5387e9b99bb3d83674e3a4a1ff39b69"), + message=b"parent of self.revision2", + author=Person( + name=b"me", email=b"me@soft.heri", fullname=b"me ", + ), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567843, microseconds=220000,), + offset=-720, + negative_utc=False, + ), + committer=Person( + name=b"committer-dude", + email=b"committer@dude.com", + fullname=b"committer-dude ", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1244567843, microseconds=220000,), + offset=-720, + negative_utc=False, + ), + parents=tuple([hg_revision3.id]), + type=RevisionType.MERCURIAL, + directory=directory.id, + metadata=None, + extra_headers=( + (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), + ), + synthetic=False, + ) + hg_revisions: Tuple[Revision, ...] = ( + hg_revision, + hg_revision2, + hg_revision3, + hg_revision4, + ) + revisions: Tuple[Revision, ...] = git_revisions + hg_revisions origins: Tuple[Origin, ...] = ( Origin(url="https://github.com/user1/repo1"), diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -23,7 +23,9 @@ from swh.model.model import ( Content, Directory, + ExtID, MetadataTargetType, + ObjectType, Origin, OriginVisit, OriginVisitStatus, @@ -1051,6 +1053,178 @@ revision3.id, } + def test_extid_add_git(self, swh_storage, sample_data): + + gitids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "git" + ] + nullids = [None] * len(gitids) + + assert swh_storage.extid_get_from_extid("git", gitids) == nullids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, gitids) == nullids + + extids = [ + ExtID( + extid=gitid, + extid_type="git", + target=gitid, + target_type=ObjectType.REVISION, + ) + for gitid in gitids + ] + summary = swh_storage.extid_add(extids) + assert summary == {"extid:add": len(gitids)} + + assert swh_storage.extid_get_from_extid("git", gitids) == extids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, gitids) == extids + + assert swh_storage.extid_get_from_extid("hg", gitids) == nullids + assert swh_storage.extid_get_from_swhid(ObjectType.RELEASE, gitids) == nullids + + def test_extid_add_hg(self, swh_storage, sample_data): + def get_node(revision): + node = None + if revision.extra_headers: + node = dict(revision.extra_headers).get(b"node") + if node is None and revision.metadata: + node = hash_to_bytes(revision.metadata.get("node")) + return node + + swhids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "hg" + ] + extids = [ + get_node(revision) + for revision in sample_data.revisions + if revision.type.value == "hg" + ] + nullids = [None] * len(swhids) + + assert swh_storage.extid_get_from_extid("hg", extids) == nullids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, swhids) == nullids + + extid_objs = [ + ExtID( + extid=hgid, + extid_type="hg", + target=swhid, + target_type=ObjectType.REVISION, + ) + for hgid, swhid in zip(extids, swhids) + ] + summary = swh_storage.extid_add(extid_objs) + assert summary == {"extid:add": len(swhids)} + + assert swh_storage.extid_get_from_extid("hg", extids) == extid_objs + assert ( + swh_storage.extid_get_from_swhid(ObjectType.REVISION, swhids) == extid_objs + ) + + assert swh_storage.extid_get_from_extid("git", extids) == nullids + assert swh_storage.extid_get_from_swhid(ObjectType.RELEASE, swhids) == nullids + + def test_extid_add_twice(self, swh_storage, sample_data): + + gitids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "git" + ] + + extids = [ + ExtID( + extid=gitid, + extid_type="git", + target=gitid, + target_type=ObjectType.REVISION, + ) + for gitid in gitids + ] + summary = swh_storage.extid_add(extids) + assert summary == {"extid:add": len(gitids)} + + # add them again, should be noop + summary = swh_storage.extid_add(extids) + # assert summary == {"extid:add": 0} + assert swh_storage.extid_get_from_extid("git", gitids) == extids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, gitids) == extids + + def test_extid_add_extid_unicity(self, swh_storage, sample_data): + + ids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "git" + ] + nullids = [None] * len(ids) + + extids = [ + ExtID( + extid=extid, + extid_type="git", + target=extid, + target_type=ObjectType.REVISION, + ) + for extid in ids + ] + swh_storage.extid_add(extids) + + # try to add "modified-extid" versions, should be noops + extids2 = [ + ExtID( + extid=extid, + extid_type="hg", + target=extid, + target_type=ObjectType.REVISION, + ) + for extid in ids + ] + swh_storage.extid_add(extids2) + + assert swh_storage.extid_get_from_extid("git", ids) == extids + assert swh_storage.extid_get_from_extid("hg", ids) == nullids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, ids) == extids + + def test_extid_add_target_unicity(self, swh_storage, sample_data): + + ids = [ + revision.id + for revision in sample_data.revisions + if revision.type.value == "git" + ] + nullids = [None] * len(ids) + + extids = [ + ExtID( + extid=extid, + extid_type="git", + target=extid, + target_type=ObjectType.REVISION, + ) + for extid in ids + ] + swh_storage.extid_add(extids) + + # try to add "modified" versions, should be noops + extids2 = [ + ExtID( + extid=extid, + extid_type="git", + target=extid, + target_type=ObjectType.RELEASE, + ) + for extid in ids + ] + swh_storage.extid_add(extids2) + + assert swh_storage.extid_get_from_extid("git", ids) == extids + assert swh_storage.extid_get_from_swhid(ObjectType.REVISION, ids) == extids + assert swh_storage.extid_get_from_swhid(ObjectType.RELEASE, ids) == nullids + def test_release_add(self, swh_storage, sample_data): release, release2 = sample_data.releases[:2]