diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -24,6 +24,9 @@ [mypy-django.*] ignore_missing_imports = True +[mypy-msgpack.*] +ignore_missing_imports = True + [mypy-multiprocessing.util] ignore_missing_imports = True diff --git a/requirements-swh-journal.txt b/requirements-swh-journal.txt --- a/requirements-swh-journal.txt +++ b/requirements-swh-journal.txt @@ -1 +1 @@ -swh.journal >= 0.3.2 +swh.journal >= 0.4 diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ -swh.core[db,http] >= 0.0.94 -swh.model >= 0.3.4 +swh.core[db,http] >= 0.1.0 +swh.model >= 0.4.0 swh.objstorage >= 0.0.40 diff --git a/sql/upgrades/158.sql b/sql/upgrades/158.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/158.sql @@ -0,0 +1,101 @@ +-- SWH DB schema upgrade +-- from_version: 157 +-- to_version: 158 +-- description: Add the extra_headers column in the revision table + +-- latest schema version +insert into dbversion(version, release, description) + values(158, now(), 'Work Still In Progress'); + +-- Adapt the revision table for new extra_headers +alter table revision add column (extra_headers bytea[][]); + +drop type revision_entry cascade; +create type revision_entry as +( + id sha1_git, + date timestamptz, + date_offset smallint, + date_neg_utc_offset boolean, + committer_date timestamptz, + committer_date_offset smallint, + committer_date_neg_utc_offset boolean, + type revision_type, + directory sha1_git, + message bytea, + author_id bigint, + author_fullname bytea, + author_name bytea, + author_email bytea, + committer_id bigint, + committer_fullname bytea, + committer_name bytea, + committer_email bytea, + metadata jsonb, + synthetic boolean, + extra_headers bytea[][], + parents bytea[], + object_id bigint +); + +-- Create entries in revision from tmp_revision +create or replace function swh_revision_add() + returns void + language plpgsql +as $$ +begin + perform swh_person_add_from_revision(); + + insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers) + select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers + from tmp_revision t + left join person a on a.fullname = t.author_fullname + left join person c on c.fullname = t.committer_fullname; + return; +end +$$; + +-- "git style" revision log. Similar to swh_revision_list(), but returning all +-- information associated to each revision, and expanding authors/committers +create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL) + returns setof revision_entry + language sql + stable +as $$ + select t.id, r.date, r.date_offset, r.date_neg_utc_offset, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, + r.type, r.directory, r.message, + a.id, a.fullname, a.name, a.email, + c.id, c.fullname, c.name, c.email, + r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id + from swh_revision_list(root_revisions, num_revs) as t + left join revision r on t.id = r.id + left join person a on a.id = r.author + left join person c on c.id = r.committer; +$$; + +create or replace function swh_revision_list_by_object_id( + min_excl bigint, + max_incl bigint +) + returns setof revision_entry + language sql + stable +as $$ + with revs as ( + select * from revision + where object_id > min_excl and object_id <= max_incl + ) + select r.id, r.date, r.date_offset, r.date_neg_utc_offset, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, + r.type, r.directory, r.message, + a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers, + array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank) + as parents, r.object_id + from revs r + left join person a on a.id = r.author + left join person c on c.id = r.committer + order by r.object_id; +$$; + +-- TODO: add the migration magic query... diff --git a/swh/storage/cassandra/converters.py b/swh/storage/cassandra/converters.py --- a/swh/storage/cassandra/converters.py +++ b/swh/storage/cassandra/converters.py @@ -22,7 +22,6 @@ ) from swh.model.hashutil import DEFAULT_ALGORITHMS -from ..converters import git_headers_to_db, db_to_git_headers from .common import Row @@ -33,11 +32,11 @@ # non-recursively convert it as a dict but make a deep copy. db_revision = deepcopy(attr.asdict(revision, recurse=False)) metadata = revision.metadata - if metadata and "extra_headers" in metadata: - db_revision["metadata"]["extra_headers"] = git_headers_to_db( - metadata["extra_headers"] - ) + extra_headers = revision.extra_headers + if not extra_headers and metadata and "extra_headers" in metadata: + extra_headers = db_revision["metadata"].pop("extra_headers") db_revision["metadata"] = json.dumps(db_revision["metadata"]) + db_revision["extra_headers"] = extra_headers db_revision["type"] = db_revision["type"].value return db_revision @@ -45,13 +44,16 @@ def revision_from_db(db_revision: Row, parents: Tuple[Sha1Git]) -> Revision: revision = db_revision._asdict() # type: ignore metadata = json.loads(revision.pop("metadata", None)) - if metadata and "extra_headers" in metadata: - extra_headers = db_to_git_headers(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers + extra_headers = revision.pop("extra_headers", ()) + if not extra_headers and metadata and "extra_headers" in metadata: + extra_headers = metadata.pop("extra_headers") + if extra_headers is None: + extra_headers = () return Revision( parents=parents, type=RevisionType(revision.pop("type")), metadata=metadata, + extra_headers=extra_headers, **revision, ) diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -425,6 +425,7 @@ "committer", "synthetic", "metadata", + "extra_headers", ] @_prepared_exists_statement("revision") diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -97,9 +97,10 @@ committer person, synthetic boolean, -- true iff revision has been created by Software Heritage - metadata text - -- extra metadata as JSON(tarball checksums, - -- extra commit information, etc...) + metadata text, + -- extra metadata as JSON(tarball checksums, etc...) + extra_headers frozen> > + -- extra commit information as (tuple(key, value), ...) ); diff --git a/swh/storage/converters.py b/swh/storage/converters.py --- a/swh/storage/converters.py +++ b/swh/storage/converters.py @@ -7,7 +7,7 @@ from typing import Optional, Dict -from swh.core.utils import decode_with_escape, encode_with_unescape +from swh.core.utils import encode_with_unescape from swh.model import identifiers from swh.model.hashutil import MultiHash @@ -64,31 +64,10 @@ } -def git_headers_to_db(git_headers): - """Convert git headers to their database representation. - - We convert the bytes to unicode by decoding them into utf-8 and replacing - invalid utf-8 sequences with backslash escapes. - - """ - ret = [] - for key, values in git_headers: - if isinstance(values, list): - ret.append([key, [decode_with_escape(value) for value in values]]) - else: - ret.append([key, decode_with_escape(values)]) - - return ret - - def db_to_git_headers(db_git_headers): ret = [] - for key, values in db_git_headers: - if isinstance(values, list): - ret.append([key, [encode_with_unescape(value) for value in values]]) - else: - ret.append([key, encode_with_unescape(values)]) - + for key, value in db_git_headers: + ret.append([key.encode("utf-8"), encode_with_unescape(value)]) return ret @@ -168,13 +147,6 @@ committer = author_to_db(revision["committer"]) committer_date = date_to_db(revision["committer_date"]) - metadata = revision["metadata"] - - if metadata and "extra_headers" in metadata: - metadata = metadata.copy() - extra_headers = git_headers_to_db(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers - return { "id": revision["id"], "author_fullname": author["fullname"], @@ -192,8 +164,9 @@ "type": revision["type"], "directory": revision["directory"], "message": revision["message"], - "metadata": metadata, + "metadata": revision["metadata"], "synthetic": revision["synthetic"], + "extra_headers": revision["extra_headers"], "parents": [ {"id": revision["id"], "parent_id": parent, "parent_rank": i,} for i, parent in enumerate(revision["parents"]) @@ -227,18 +200,17 @@ db_revision["committer_date_neg_utc_offset"], ) - metadata = db_revision["metadata"] - - if metadata and "extra_headers" in metadata: - extra_headers = db_to_git_headers(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers - parents = [] if "parents" in db_revision: for parent in db_revision["parents"]: if parent: parents.append(parent) + metadata = db_revision["metadata"] + extra_headers = db_revision.get("extra_headers", ()) + if not extra_headers and metadata and "extra_headers" in metadata: + extra_headers = db_to_git_headers(metadata.pop("extra_headers")) + ret = { "id": db_revision["id"], "author": author, @@ -250,6 +222,7 @@ "message": db_revision["message"], "metadata": metadata, "synthetic": db_revision["synthetic"], + "extra_headers": extra_headers, "parents": parents, } diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -423,6 +423,7 @@ "committer_email", "metadata", "synthetic", + "extra_headers", ] revision_get_cols = revision_add_cols + ["parents"] diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -239,7 +239,8 @@ metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) object_id bigserial, date_neg_utc_offset boolean, - committer_date_neg_utc_offset boolean + committer_date_neg_utc_offset boolean, + extra_headers bytea[][] -- extra headers (used in hash computation) ); comment on table revision is 'A revision represents the state of a source code tree at a specific point in time'; @@ -258,6 +259,7 @@ comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage'; comment on column revision.metadata is 'Extra revision metadata'; comment on column revision.object_id is 'Non-intrinsic, sequential object identifier'; +comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation'; -- either this table or the sha1_git[] column on the revision table diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -454,6 +454,7 @@ committer_email bytea, metadata jsonb, synthetic boolean, + extra_headers bytea[][], parents bytea[], object_id bigint ); @@ -471,7 +472,7 @@ r.type, r.directory, r.message, a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, - r.metadata, r.synthetic, t.parents, r.object_id + r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id from swh_revision_list(root_revisions, num_revs) as t left join revision r on t.id = r.id left join person a on a.id = r.author @@ -528,8 +529,8 @@ begin perform swh_person_add_from_revision(); - insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic) - select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic + insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers) + select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers from tmp_revision t left join person a on a.fullname = t.author_fullname left join person c on c.fullname = t.committer_fullname; @@ -791,7 +792,7 @@ select r.id, r.date, r.date_offset, r.date_neg_utc_offset, r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.type, r.directory, r.message, - a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, + a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers, array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank) as parents, r.object_id from revs r diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -217,12 +217,12 @@ "metadata": { "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",}, "signed-off-by": "some-dude", - "extra_headers": [ - ["gpgsig", b"test123"], - ["mergetag", b"foo\\bar"], - ["mergetag", b"\x22\xaf\x89\x80\x01\x00"], - ], }, + "extra_headers": ( + (b"gpgsig", b"test123"), + (b"mergetag", b"foo\\bar"), + (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), + ), "synthetic": True, } @@ -253,6 +253,7 @@ "type": "git", "directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2 "metadata": None, + "extra_headers": (), "synthetic": False, } @@ -283,6 +284,7 @@ "type": "git", "directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2 "metadata": None, + "extra_headers": (), "synthetic": True, } @@ -315,6 +317,7 @@ "type": "git", "directory": hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), # dir "metadata": None, + "extra_headers": (), "synthetic": False, } diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py --- a/swh/storage/tests/test_converters.py +++ b/swh/storage/tests/test_converters.py @@ -85,6 +85,7 @@ "committer_email": b"comm-email", "metadata": {}, "synthetic": False, + "extra_headers": (), "parents": [123, 456], } ) @@ -109,6 +110,7 @@ "message": b"commit message", "metadata": {}, "synthetic": False, + "extra_headers": (), "parents": [123, 456], } @@ -147,14 +149,3 @@ "target": b"revision-id", "target_type": "revision", } - - -def test_db_to_git_headers(): - raw_data = [ - ["gpgsig", b"garbage\x89a\x43b\x14"], - ["extra", [b"foo\\\\\\o", b"bar\\", b"inval\\\\\x99id"]], - ] - - db_data = converters.git_headers_to_db(raw_data) - loop = converters.db_to_git_headers(db_data) - assert raw_data == loop diff --git a/swh/storage/tests/test_revision_bw_compat.py b/swh/storage/tests/test_revision_bw_compat.py new file mode 100644 --- /dev/null +++ b/swh/storage/tests/test_revision_bw_compat.py @@ -0,0 +1,48 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import attr + +from swh.core.utils import decode_with_escape +from swh.model.model import Revision +from swh.storage import get_storage +from swh.storage.tests.storage_data import data +from swh.storage.tests.test_storage import db_transaction + + +def headers_to_db(git_headers): + return [[key, decode_with_escape(value)] for key, value in git_headers] + + +def test_revision_extra_header_in_metadata(swh_storage_backend_config): + storage = get_storage(**swh_storage_backend_config) + rev = Revision.from_dict(data.revision) + + md_w_extra = dict( + rev.metadata.items(), + extra_headers=headers_to_db( + [ + ["gpgsig", b"test123"], + ["mergetag", b"foo\\bar"], + ["mergetag", b"\x22\xaf\x89\x80\x01\x00"], + ] + ), + ) + + bw_rev = attr.evolve(rev, extra_headers=()) + object.__setattr__(bw_rev, "metadata", md_w_extra) + assert bw_rev.extra_headers == () + + assert storage.revision_add([bw_rev]) == {"revision:add": 1} + + # check data in the db are old format + with db_transaction(storage) as (_, cur): + cur.execute("SELECT metadata, extra_headers FROM revision") + metadata, extra_headers = cur.fetchone() + assert extra_headers == [] + assert metadata == bw_rev.metadata + + # check the Revision build from revision_get is the original, "new style", Revision + assert [Revision.from_dict(x) for x in storage.revision_get([rev.id])] == [rev]