def revision_to_vcsid(revision: Revision) -> Optional[bytes]:
    """Get the original VCS intrinsic identifier for the given revision.

    Args:
        revision: the revision to inspect; only its ``type``, ``id``,
            ``extra_headers`` and ``metadata`` attributes are read.

    Returns:
        - for ``git`` revisions, ``revision.id``;
        - for ``hg`` revisions, the value stored under ``b"node"`` in
          ``extra_headers`` or, failing that, the value stored under
          ``"node"`` in ``metadata`` converted with ``hash_to_bytes``;
        - ``None`` when no identifier can be determined (any other revision
          type, or an ``hg`` revision that carries no node at all).
    """
    vcs = revision.type.value
    if vcs == "git":
        return revision.id
    if vcs == "hg":
        node = None
        if revision.extra_headers:
            node = dict(revision.extra_headers).get(b"node")
        if node is None and revision.metadata:
            # A metadata dict does not necessarily hold a "node" entry;
            # only convert the value when it is actually there.
            legacy_node = revision.metadata.get("node")
            if legacy_node is not None:
                node = hash_to_bytes(legacy_node)
        return node

    # Fall back. NOTE(review): whether revision.id should be returned here
    # instead of None is still an open question.
    return None
def swhid_get_from_vcsid_list(self, vcs_type, vcs_ids, cur=None):
    """Yield the ``vcsid`` rows matching the given original VCS ids.

    Args:
        vcs_type: value of the ``vcs`` column to match
        vcs_ids: original VCS identifiers to look up
        cur: optional cursor, resolved through ``self._cursor``

    Yields:
        tuples holding the columns listed in ``self.vcsid_cols``, ordered
        following ``vcs_ids``. Because of the LEFT JOIN, an id without any
        match yields a row whose columns are all NULL.
    """
    cur = self._cursor(cur)

    columns = ", ".join(self.mangle_query_key(k, "vcsid") for k in self.vcsid_cols)
    # vcs_type travels as a bound value inside every VALUES row instead of
    # being interpolated into the SQL text, so it cannot alter the query.
    sql = """
        SELECT %s FROM (VALUES %%s) as t(sortkey, vcs, id)
        LEFT JOIN vcsid
          ON vcsid.vcs = t.vcs::revision_type AND vcsid.vcs_id = t.id
        ORDER BY sortkey
        """ % (
        columns,
    )
    yield from execute_values_generator(
        cur,
        sql,
        ((sortkey, vcs_type, vcs_id) for sortkey, vcs_id in enumerate(vcs_ids)),
    )
"swhid_type": ObjectType.REVISION.value, + "swhid_value": revision.id, + } + for revision in revisions_filtered + ] + db.mktemp("vcsid", cur) + + db.copy_to(vcsid, "tmp_vcsid", db.vcsid_cols, cur) + + # move metadata in place + db.vcsid_add_from_temp(cur) + return {"revision:add": len(revisions_missing)} @timed @@ -610,6 +629,17 @@ return revisions + @timed + @db_transaction(statement_timeout=1000) + def revision_id_from_vcs( + self, vcs_type: str, vcs_ids: List[bytes], db=None, cur=None + ) -> List[Optional[Sha1Git]]: + rev_ids = [] + for line in db.swhid_get_from_vcsid_list(vcs_type, vcs_ids, cur): + rev_ids.append(line[3]) + + return rev_ids + @timed @db_transaction_generator(statement_timeout=2000) def revision_log( diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -277,6 +277,22 @@ comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; +-- The original VCS <-> swhid relation table +create table vcsid +( + vcs revision_type not null, + vcs_id bytea not null, + swhid_type object_type not null, + swhid_value sha1_git not null +); + +comment on table vcsid is 'Correspondance SWH object (SWHID) <-> original revision id (vcs id)'; +comment on column vcsid.vcs is 'VCS type'; +comment on column vcsid.vcs_id is 'Intrinsic identifier of the revision in the original VCS'; +comment on column vcsid.swhid_type is 'Type of SWHID of the referenced SWH object'; +comment on column vcsid.swhid_value is 'Value (hash) of SWHID of the refenced SWH object'; + + -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a -- snapshot object capturing the full-state of the origin at visit time. 
-- Create entries in vcsid from the rows staged in tmp_vcsid.
-- Operates in bulk:
--   0. swh_mktemp(vcsid)
--   1. COPY to tmp_vcsid
--   2. call this function
-- Identical staged rows are inserted only once (select distinct).
create or replace function swh_vcsid_add()
    returns void
    language plpgsql
as $$
begin
    insert into vcsid (vcs, vcs_id, swhid_type, swhid_value)
        select distinct
            tmp_vcsid.vcs,
            tmp_vcsid.vcs_id,
            tmp_vcsid.swhid_type,
            tmp_vcsid.swhid_value
        from tmp_vcsid;
end
$$;
# Mercurial fixture whose only parent is hg_revision; its node is carried in
# extra_headers and it has no metadata.
hg_revision2 = Revision(
    id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
    type=RevisionType.MERCURIAL,
    message=b"hello again",
    directory=directory2.id,
    parents=(hg_revision.id,),
    author=Person(
        fullname=b"Roberto Dicosmo ",
        name=b"Roberto Dicosmo",
        email=b"roberto@example.com",
    ),
    date=TimestampWithTimezone(
        timestamp=Timestamp(seconds=1234567843, microseconds=220000),
        offset=-720,
        negative_utc=False,
    ),
    committer=Person(fullname=b"tony ", name=b"tony", email=b"ar@dumont.fr"),
    committer_date=TimestampWithTimezone(
        timestamp=Timestamp(seconds=1123456789, microseconds=220000),
        offset=0,
        negative_utc=False,
    ),
    metadata=None,
    extra_headers=(
        (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")),
    ),
    synthetic=False,
)
# Mercurial fixture whose only parent is hg_revision3.
hg_revision4 = Revision(
    id=hash_to_bytes("42070a39e5387e9b99bb3d83674e3a4a1ff39b69"),
    message=b"parent of self.revision2",
    author=Person(name=b"me", email=b"me@soft.heri", fullname=b"me "),
    date=TimestampWithTimezone(
        timestamp=Timestamp(seconds=1234567843, microseconds=220000),
        offset=-720,
        negative_utc=False,
    ),
    committer=Person(
        name=b"committer-dude",
        email=b"committer@dude.com",
        fullname=b"committer-dude ",
    ),
    committer_date=TimestampWithTimezone(
        timestamp=Timestamp(seconds=1244567843, microseconds=220000),
        offset=-720,
        negative_utc=False,
    ),
    parents=(hg_revision3.id,),
    type=RevisionType.MERCURIAL,
    directory=directory.id,
    metadata=None,
    # The node must differ from every other hg fixture: it is the key used
    # to map an original Mercurial id back to a single revision.
    extra_headers=(
        (b"node", hash_to_bytes("c8e2e9a7a0e1f4a4b1d2c3e4f5a6b7c8d9e0f1a2")),
    ),
    synthetic=False,
)
def test_revision_hg_vcsid(self, swh_storage, sample_data):
    """Mercurial revisions can be found from their original node id."""
    from swh.storage.common import revision_to_vcsid

    # Keep a single revision per node so that every looked-up VCS id maps
    # to exactly one revision id.
    by_node = {}
    for revision in sample_data.hg_revisions:
        assert revision.type.value == "hg"
        by_node.setdefault(revision_to_vcsid(revision), revision)
    # Guard against a vacuous test: there must be something to look up and
    # every fixture must expose a node.
    assert by_node
    assert None not in by_node

    nodes = list(by_node)
    revisions = list(by_node.values())
    revision_ids = [revision.id for revision in revisions]

    assert swh_storage.revision_id_from_vcs("hg", nodes) == [None] * len(nodes)

    swh_storage.revision_add(revisions)
    assert swh_storage.revision_id_from_vcs("hg", nodes) == revision_ids
    assert swh_storage.revision_id_from_vcs("git", nodes) == [None] * len(nodes)