diff --git a/sql/upgrades/182.sql b/sql/upgrades/182.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/182.sql @@ -0,0 +1,50 @@ +-- SWH DB schema upgrade +-- from_version: 181 +-- to_version: 182 +-- description: add displayname field to person table + +insert into dbversion(version, release, description) + values(182, now(), 'Work In Progress'); + +alter table person add column displayname bytea; + +comment on table person is 'Person, referenced in Revision author/committer or Release author'; +comment on column person.id is 'Internal id'; +comment on column person.name is 'Name (advisory, only present if parsed from fullname)'; +comment on column person.email is 'Email (advisory, only present if parsed from fullname)'; +comment on column person.fullname is 'Full name, usually of the form `Name `, ' + 'used in integrity computations'; +comment on column person.displayname is 'Full name, usually of the form `Name `, ' + 'used for display queries'; + +create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL, ignore_displayname boolean default false) + returns setof revision_entry + language sql + stable +as $$ + -- when name and email are null, swh.storage.postgresql.converters.db_to_author() + -- parses fullname to populate them, so we can just drop them here + select t.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, + r.type, r.directory, r.message, + a.id, + case when ignore_displayname or a.displayname is null then a.fullname else a.displayname end, + case when ignore_displayname or a.displayname is null then a.name else null end, + case when ignore_displayname or a.displayname is null then a.email else null end, + c.id, + case when ignore_displayname or c.displayname is null then c.fullname else c.displayname end, + case when ignore_displayname or c.displayname is null then c.name else null end, + case when ignore_displayname or c.displayname is null then c.email else null end, + r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers, + r.raw_manifest + from swh_revision_list(root_revisions, num_revs) as t + left join revision r on t.id = r.id + left join person a on a.id = r.author + left join person c on c.id = r.committer; +$$; + +comment on function swh_revision_log(root_revisions bytea[], num_revs bigint, ignore_displayname boolean) + is '"git style" revision log. Similar to swh_revision_list(), but returning ' + 'all information associated to each revision, and expanding authors/committers'; + +drop function if exists swh_revision_log(root_revisions bytea[], num_revs bigint); diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -740,7 +740,9 @@ return self._cql_runner.revision_missing(revisions) @timed - def revision_get(self, revision_ids: List[Sha1Git]) -> List[Optional[Revision]]: + def revision_get( + self, revision_ids: List[Sha1Git], ignore_displayname: bool = False + ) -> List[Optional[Revision]]: rows = self._cql_runner.revision_get(revision_ids) revisions: Dict[Sha1Git, Revision] = {} for row in rows: @@ -808,7 +810,10 @@ @timed def revision_log( - self, revisions: List[Sha1Git], limit: Optional[int] = None + self, + revisions: List[Sha1Git], + ignore_displayname: bool = False, + limit: Optional[int] = None, ) -> Iterable[Optional[Dict[str, Any]]]: seen: Set[Sha1Git] = set() yield from self._get_parent_revs(revisions, seen, limit, False) @@ -846,7 +851,9 @@ return self._cql_runner.release_missing(releases) @timed - def release_get(self, releases: List[Sha1Git]) -> List[Optional[Release]]: + def release_get( + self, releases: List[Sha1Git], ignore_displayname: bool = False + ) -> List[Optional[Release]]: rows = self._cql_runner.release_get(releases) rels: Dict[Sha1Git, Release] = {} for row in rows: diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -348,7 +348,9 @@ if self._revisions.get_from_primary_key((id_,)) is not None: yield id_ - def revision_get(self, revision_ids: List[Sha1Git]) -> Iterable[RevisionRow]: + def revision_get( + self, revision_ids: List[Sha1Git], ignore_displayname: bool = False + ) -> Iterable[RevisionRow]: for id_ in revision_ids: row = self._revisions.get_from_primary_key((id_,)) if row: @@ -383,7 +385,9 @@ self._releases.insert(release) self.increment_counter("release", 1) - def release_get(self, release_ids: List[str]) -> Iterable[ReleaseRow]: + def release_get( + self, release_ids: List[str], ignore_displayname: bool = False + ) -> Iterable[ReleaseRow]: for id_ in release_ids: row = self._releases.get_from_primary_key((id_,)) if row: diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -532,11 +532,15 @@ ... @remote_api_endpoint("revision") - def revision_get(self, revision_ids: List[Sha1Git]) -> List[Optional[Revision]]: + def revision_get( + self, revision_ids: List[Sha1Git], ignore_displayname: bool = False + ) -> List[Optional[Revision]]: """Get revisions from storage Args: revisions: revision ids + ignore_displayname: return the original author/committer's full name even if + it's masked by a displayname. Returns: list of revision object (if the revision exists or None otherwise) @@ -604,12 +608,17 @@ @remote_api_endpoint("revision/log") def revision_log( - self, revisions: List[Sha1Git], limit: Optional[int] = None + self, + revisions: List[Sha1Git], + ignore_displayname: bool = False, + limit: Optional[int] = None, ) -> Iterable[Optional[Dict[str, Any]]]: """Fetch revision entry from the given root revisions. Args: revisions: array of root revisions to lookup + ignore_displayname: return the original author/committer's full name even if + it's masked by a displayname. limit: limitation on the output result. Default to None. Yields: @@ -686,11 +695,15 @@ ... @remote_api_endpoint("release") - def release_get(self, releases: List[Sha1Git]) -> List[Optional[Release]]: + def release_get( + self, releases: List[Sha1Git], ignore_displayname: bool = False + ) -> List[Optional[Release]]: """Given a list of sha1, return the releases's information Args: releases: list of sha1s + ignore_displayname: return the original author's full name even if it's + masked by a displayname. Returns: List of releases matching the identifiers or None if the release does diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -30,7 +30,7 @@ """ - current_version = 181 + current_version = 182 def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute( @@ -814,7 +814,7 @@ return cur.fetchone() @staticmethod - def mangle_query_key(key, main_table): + def mangle_query_key(key, main_table, ignore_displayname=False): if key == "id": return "t.id" if key == "parents": @@ -825,25 +825,45 @@ WHERE rh.id = t.id ORDER BY rh.parent_rank )""" + if "_" not in key: - return "%s.%s" % (main_table, key) + return f"{main_table}.{key}" head, tail = key.split("_", 1) - if head in ("author", "committer") and tail in ( + if head not in ("author", "committer") or tail not in ( "name", "email", "id", "fullname", ): - return "%s.%s" % (head, tail) + return f"{main_table}.{key}" - return "%s.%s" % (main_table, key) + if ignore_displayname: + return f"{head}.{tail}" + else: + if tail == "id": + return f"{head}.{tail}" + elif tail in ("name", "email"): + # These fields get populated again from fullname by + # converters.db_to_author if they're None, so we can just NULLify them + # when displayname is set. + return ( + f"CASE" + f" WHEN {head}.displayname IS NULL THEN {head}.{tail} " + f" ELSE NULL " + f"END AS {key}" + ) + elif tail == "fullname": + return f"COALESCE({head}.displayname, {head}.fullname) AS {key}" - def revision_get_from_list(self, revisions, cur=None): + assert False, "All cases should have been handled here" + + def revision_get_from_list(self, revisions, ignore_displayname=False, cur=None): cur = self._cursor(cur) query_keys = ", ".join( - self.mangle_query_key(k, "revision") for k in self.revision_get_cols + self.mangle_query_key(k, "revision", ignore_displayname) + for k in self.revision_get_cols ) yield from execute_values_generator( @@ -924,16 +944,20 @@ template=b"(%s,%s,%s::object_type)", ) - def revision_log(self, root_revisions, limit=None, cur=None): + def revision_log( + self, root_revisions, ignore_displayname=False, limit=None, cur=None + ): cur = self._cursor(cur) - query = """SELECT %s - FROM swh_revision_log(%%s, %%s) - """ % ", ".join( + query = """\ + SELECT %s + FROM swh_revision_log( + "root_revisions" := %%s, num_revs := %%s, "ignore_displayname" := %%s + )""" % ", ".join( self.revision_get_cols ) - cur.execute(query, (root_revisions, limit)) + cur.execute(query, (root_revisions, limit, ignore_displayname)) yield from cur revision_shortlog_cols = ["id", "parents"] @@ -1237,10 +1261,11 @@ cur.execute(query) yield from map(lambda row: row[0], cur) - def release_get_from_list(self, releases, cur=None): + def release_get_from_list(self, releases, ignore_displayname=False, cur=None): cur = self._cursor(cur) query_keys = ", ".join( - self.mangle_query_key(k, "release") for k in self.release_get_cols + self.mangle_query_key(k, "release", ignore_displayname) + for k in self.release_get_cols ) yield from execute_values_generator( diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -683,10 +683,15 @@ @timed @db_transaction(statement_timeout=1000) def revision_get( - self, revision_ids: List[Sha1Git], *, db: Db, cur=None + self, + revision_ids: List[Sha1Git], + ignore_displayname: bool = False, + *, + db: Db, + cur=None, ) -> List[Optional[Revision]]: revisions = [] - for line in db.revision_get_from_list(revision_ids, cur): + for line in db.revision_get_from_list(revision_ids, ignore_displayname, cur): revision = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) revisions.append(revision) @@ -695,9 +700,17 @@ @timed @db_transaction_generator(statement_timeout=2000) def revision_log( - self, revisions: List[Sha1Git], limit: Optional[int] = None, *, db: Db, cur=None + self, + revisions: List[Sha1Git], + ignore_displayname: bool = False, + limit: Optional[int] = None, + *, + db: Db, + cur=None, ) -> Iterable[Optional[Dict[str, Any]]]: - for line in db.revision_log(revisions, limit, cur): + for line in db.revision_log( + revisions, ignore_displayname=ignore_displayname, limit=limit, cur=cur + ): data = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) if not data: yield None @@ -831,10 +844,15 @@ @timed @db_transaction(statement_timeout=500) def release_get( - self, releases: List[Sha1Git], *, db: Db, cur=None + self, + releases: List[Sha1Git], + ignore_displayname: bool = False, + *, + db: Db, + cur=None, ) -> List[Optional[Release]]: rels = [] - for release in db.release_get_from_list(releases, cur): + for release in db.release_get_from_list(releases, ignore_displayname, cur): data = converters.db_to_release(dict(zip(db.release_get_cols, release))) rels.append(data if data else None) return rels diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(181, now(), 'Work In Progress'); + values(182, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -202,18 +202,21 @@ -- release metadata. create table person ( - id bigserial, - name bytea, -- advisory: not null if we managed to parse a name - email bytea, -- advisory: not null if we managed to parse an email - fullname bytea not null -- freeform specification; what is actually used in the checksums - -- will usually be of the form 'name ' + id bigserial, + name bytea, + email bytea, + fullname bytea not null, + displayname bytea ); -comment on table person is 'Person referenced in code artifact release metadata'; -comment on column person.id is 'Person identifier'; -comment on column person.name is 'Name'; -comment on column person.email is 'Email'; -comment on column person.fullname is 'Full name (raw name)'; +comment on table person is 'Person, referenced in Revision author/committer or Release author'; +comment on column person.id is 'Internal id'; +comment on column person.name is 'Name (advisory, only present if parsed from fullname)'; +comment on column person.email is 'Email (advisory, only present if parsed from fullname)'; +comment on column person.fullname is 'Full name, usually of the form `Name `, ' + 'used in integrity computations'; +comment on column person.displayname is 'Full name, usually of the form `Name `, ' + 'used for display queries'; -- The state of a source code tree at a specific point in time. diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -501,18 +501,24 @@ ); --- "git style" revision log. Similar to swh_revision_list(), but returning all --- information associated to each revision, and expanding authors/committers -create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL) +create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL, ignore_displayname boolean default false) returns setof revision_entry language sql stable as $$ + -- when name and email are null, swh.storage.postgresql.converters.db_to_author() + -- parses fullname to populate them, so we can just drop them here select t.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, r.type, r.directory, r.message, - a.id, a.fullname, a.name, a.email, - c.id, c.fullname, c.name, c.email, + a.id, + case when ignore_displayname or a.displayname is null then a.fullname else a.displayname end, + case when ignore_displayname or a.displayname is null then a.name else null end, + case when ignore_displayname or a.displayname is null then a.email else null end, + c.id, + case when ignore_displayname or c.displayname is null then c.fullname else c.displayname end, + case when ignore_displayname or c.displayname is null then c.name else null end, + case when ignore_displayname or c.displayname is null then c.email else null end, r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers, r.raw_manifest from swh_revision_list(root_revisions, num_revs) as t @@ -521,6 +527,9 @@ left join person c on c.id = r.committer; $$; +comment on function swh_revision_log(root_revisions bytea[], num_revs bigint, ignore_displayname boolean) + is '"git style" revision log. Similar to swh_revision_list(), but returning ' + 'all information associated to each revision, and expanding authors/committers'; -- Detailed entry for a release create type release_entry as diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1223,7 +1223,7 @@ # revision4 -is-child-of-> revision3 swh_storage.revision_add([revision3, revision4]) - results = list(swh_storage.revision_log([revision4.id], 1)) + results = list(swh_storage.revision_log([revision4.id], limit=1)) actual_results = [Revision.from_dict(r) for r in results] assert len(actual_results) == 1 @@ -1628,7 +1628,7 @@ release, synthetic=False, metadata=None, - author=attr.evolve(release.author, name=None, email=None) + author=Person.from_fullname(release.author.fullname) if release.author else None, ) diff --git a/swh/storage/tests/test_postgresql.py b/swh/storage/tests/test_postgresql.py --- a/swh/storage/tests/test_postgresql.py +++ b/swh/storage/tests/test_postgresql.py @@ -11,6 +11,7 @@ import attr import pytest +from swh.model.model import Person from swh.storage.postgresql.db import Db from swh.storage.tests.storage_tests import TestStorage as _TestStorage from swh.storage.tests.storage_tests import TestStorageGeneratedData # noqa @@ -252,6 +253,134 @@ "Content too long", ) + def test_revision_get_displayname_behavior(self, swh_storage, sample_data): + """Check revision_get behavior when displayname is set""" + revision, revision2 = sample_data.revisions[:2] + + # Make authors and committers known + revision = attr.evolve( + revision, + author=Person.from_fullname(b"author1 "), + committer=Person.from_fullname(b"committer1 "), + ) + revision = attr.evolve(revision, id=revision.compute_hash()) + revision2 = attr.evolve( + revision2, + author=Person.from_fullname(b"author2 "), + committer=Person.from_fullname(b"committer2 "), + ) + revision2 = attr.evolve(revision2, id=revision2.compute_hash()) + + add_result = swh_storage.revision_add([revision, revision2]) + assert add_result == {"revision:add": 2} + + # Before displayname change + revisions = swh_storage.revision_get([revision.id, revision2.id]) + assert revisions == [revision, revision2] + + displayname = b"Display Name " + + with db_transaction(swh_storage) as (_, cur): + cur.execute( + "UPDATE person set displayname = %s where fullname = %s", + (displayname, revision.author.fullname), + ) + + revisions = swh_storage.revision_get([revision.id, revision2.id]) + assert revisions == [ + attr.evolve(revision, author=Person.from_fullname(displayname)), + revision2, + ] + + revisions = swh_storage.revision_get( + [revision.id, revision2.id], ignore_displayname=True + ) + assert revisions == [revision, revision2] + + def test_revision_log_displayname_behavior(self, swh_storage, sample_data): + """Check revision_log behavior when displayname is set""" + revision, revision2 = sample_data.revisions[:2] + + # Make authors, committers and parenthood relationship known + # (revision2 -[parent]-> revision1) + revision = attr.evolve( + revision, + author=Person.from_fullname(b"author1 "), + committer=Person.from_fullname(b"committer1 "), + ) + revision = attr.evolve(revision, id=revision.compute_hash()) + revision2 = attr.evolve( + revision2, + parents=(revision.id,), + author=Person.from_fullname(b"author2 "), + committer=Person.from_fullname(b"committer2 "), + ) + revision2 = attr.evolve(revision2, id=revision2.compute_hash()) + + add_result = swh_storage.revision_add([revision, revision2]) + assert add_result == {"revision:add": 2} + + # Before displayname change + revisions = swh_storage.revision_log([revision2.id]) + assert list(revisions) == [revision2.to_dict(), revision.to_dict()] + + displayname = b"Display Name " + + with db_transaction(swh_storage) as (_, cur): + cur.execute( + "UPDATE person set displayname = %s where fullname = %s", + (displayname, revision.author.fullname), + ) + + revisions = swh_storage.revision_log([revision2.id]) + assert list(revisions) == [ + revision2.to_dict(), + attr.evolve(revision, author=Person.from_fullname(displayname)).to_dict(), + ] + + revisions = swh_storage.revision_log([revision2.id], ignore_displayname=True) + assert list(revisions) == [revision2.to_dict(), revision.to_dict()] + + def test_release_get_displayname_behavior(self, swh_storage, sample_data): + """Check release_get behavior when displayname is set""" + release, release2 = sample_data.releases[:2] + + # Make authors known + release = attr.evolve( + release, author=Person.from_fullname(b"author1 "), + ) + release = attr.evolve(release, id=release.compute_hash()) + release2 = attr.evolve( + release2, author=Person.from_fullname(b"author2 "), + ) + release2 = attr.evolve(release2, id=release2.compute_hash()) + + add_result = swh_storage.release_add([release, release2]) + assert add_result == {"release:add": 2} + + # Before displayname change + releases = swh_storage.release_get([release.id, release2.id]) + assert releases == [release, release2] + + displayname = b"Display Name " + + with db_transaction(swh_storage) as (_, cur): + cur.execute( + "UPDATE person set displayname = %s where fullname = %s", + (displayname, release.author.fullname), + ) + + releases = swh_storage.release_get([release.id, release2.id]) + assert releases == [ + attr.evolve(release, author=Person.from_fullname(displayname)), + release2, + ] + + releases = swh_storage.release_get( + [release.id, release2.id], ignore_displayname=True + ) + assert releases == [release, release2] + def test_clear_buffers(self, swh_storage): """Calling clear buffers on real storage does nothing