diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -24,6 +24,9 @@ [mypy-django.*] ignore_missing_imports = True +[mypy-msgpack.*] +ignore_missing_imports = True + [mypy-multiprocessing.util] ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ swh.core[db,http] >= 0.0.94 -swh.model >= 0.3.4 +swh.model >= 0.4.0 swh.objstorage >= 0.0.40 diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tenacity cassandra-driver >= 3.19.0, != 3.21.0 deprecated +msgpack diff --git a/sql/upgrades/158.sql b/sql/upgrades/158.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/158.sql @@ -0,0 +1,13 @@ +-- SWH DB schema upgrade +-- from_version: 157 +-- to_version: 158 +-- description: Add the extra_headers column to the revision table + +-- latest schema version (must match to_version above) +insert into dbversion(version, release, description) + values(158, now(), 'Work Still In Progress'); + +-- Add the extra_headers column (ADD COLUMN takes a bare column definition, no parentheses) +alter table revision add column extra_headers bytea[][]; + +-- TODO: add the migration magic query... diff --git a/swh/storage/cassandra/converters.py b/swh/storage/cassandra/converters.py --- a/swh/storage/cassandra/converters.py +++ b/swh/storage/cassandra/converters.py @@ -5,6 +5,7 @@ import datetime import json +import msgpack import attr from copy import deepcopy @@ -22,7 +23,6 @@ ) from swh.model.hashutil import DEFAULT_ALGORITHMS -from ..converters import git_headers_to_db, db_to_git_headers from .common import Row @@ -33,11 +33,11 @@ # non-recursively convert it as a dict but make a deep copy. 
db_revision = deepcopy(attr.asdict(revision, recurse=False)) metadata = revision.metadata - if metadata and "extra_headers" in metadata: - db_revision["metadata"]["extra_headers"] = git_headers_to_db( - metadata["extra_headers"] - ) + extra_headers = revision.extra_headers + if not extra_headers and metadata and "extra_headers" in metadata: + extra_headers = db_revision["metadata"].pop("extra_headers") db_revision["metadata"] = json.dumps(db_revision["metadata"]) + db_revision["extra_headers"] = msgpack.dumps(extra_headers) db_revision["type"] = db_revision["type"].value return db_revision @@ -45,13 +45,17 @@ def revision_from_db(db_revision: Row, parents: Tuple[Sha1Git]) -> Revision: revision = db_revision._asdict() # type: ignore metadata = json.loads(revision.pop("metadata", None)) - if metadata and "extra_headers" in metadata: - extra_headers = db_to_git_headers(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers + extra_headers = revision.pop("extra_headers", ()) + if not extra_headers: + if metadata and "extra_headers" in metadata: + extra_headers = metadata.pop("extra_headers") + else: + extra_headers = msgpack.loads(extra_headers) return Revision( parents=parents, type=RevisionType(revision.pop("type")), metadata=metadata, + extra_headers=extra_headers, **revision, ) diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -425,6 +425,7 @@ "committer", "synthetic", "metadata", + "extra_headers", ] @_prepared_exists_statement("revision") diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -97,9 +97,10 @@ committer person, synthetic boolean, -- true iff revision has been created by Software Heritage - metadata text - -- extra metadata as JSON(tarball checksums, - -- extra commit information, etc...) 
+ metadata text, + -- extra metadata as JSON(tarball checksums, etc...) + extra_headers blob + -- extra commit information as msgpack (tuple(key, value), ...) ); diff --git a/swh/storage/converters.py b/swh/storage/converters.py --- a/swh/storage/converters.py +++ b/swh/storage/converters.py @@ -168,13 +168,6 @@ committer = author_to_db(revision["committer"]) committer_date = date_to_db(revision["committer_date"]) - metadata = revision["metadata"] - - if metadata and "extra_headers" in metadata: - metadata = metadata.copy() - extra_headers = git_headers_to_db(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers - return { "id": revision["id"], "author_fullname": author["fullname"], @@ -192,8 +185,9 @@ "type": revision["type"], "directory": revision["directory"], "message": revision["message"], - "metadata": metadata, + "metadata": revision["metadata"], "synthetic": revision["synthetic"], + "extra_headers": revision["extra_headers"], "parents": [ {"id": revision["id"], "parent_id": parent, "parent_rank": i,} for i, parent in enumerate(revision["parents"]) @@ -227,12 +221,6 @@ db_revision["committer_date_neg_utc_offset"], ) - metadata = db_revision["metadata"] - - if metadata and "extra_headers" in metadata: - extra_headers = db_to_git_headers(metadata["extra_headers"]) - metadata["extra_headers"] = extra_headers - parents = [] if "parents" in db_revision: for parent in db_revision["parents"]: @@ -248,8 +236,9 @@ "type": db_revision["type"], "directory": db_revision["directory"], "message": db_revision["message"], - "metadata": metadata, + "metadata": db_revision["metadata"], "synthetic": db_revision["synthetic"], + "extra_headers": db_revision.get("extra_headers", ()), "parents": parents, } diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -423,6 +423,7 @@ "committer_email", "metadata", "synthetic", + "extra_headers", ] revision_get_cols = revision_add_cols + ["parents"] diff --git 
a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -239,7 +239,8 @@ metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...) object_id bigserial, date_neg_utc_offset boolean, - committer_date_neg_utc_offset boolean + committer_date_neg_utc_offset boolean, + extra_headers bytea[][] -- extra headers (used in hash computation) ); comment on table revision is 'A revision represents the state of a source code tree at a specific point in time'; @@ -258,6 +259,7 @@ comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage'; comment on column revision.metadata is 'Extra revision metadata'; comment on column revision.object_id is 'Non-intrinsic, sequential object identifier'; +comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation'; -- either this table or the sha1_git[] column on the revision table diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -454,6 +454,7 @@ committer_email bytea, metadata jsonb, synthetic boolean, + extra_headers bytea[][], parents bytea[], object_id bigint ); @@ -471,7 +472,7 @@ r.type, r.directory, r.message, a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, - r.metadata, r.synthetic, t.parents, r.object_id + r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id from swh_revision_list(root_revisions, num_revs) as t left join revision r on t.id = r.id left join person a on a.id = r.author @@ -528,8 +529,8 @@ begin perform swh_person_add_from_revision(); - insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic) - select t.id, t.date, t.date_offset, 
t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic + insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers) + select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers from tmp_revision t left join person a on a.fullname = t.author_fullname left join person c on c.fullname = t.committer_fullname; @@ -791,7 +792,7 @@ select r.id, r.date, r.date_offset, r.date_neg_utc_offset, r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.type, r.directory, r.message, - a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, + a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers, array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank) as parents, r.object_id from revs r diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py --- a/swh/storage/tests/storage_data.py +++ b/swh/storage/tests/storage_data.py @@ -217,12 +217,12 @@ "metadata": { "checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",}, "signed-off-by": "some-dude", - "extra_headers": [ - ["gpgsig", b"test123"], - ["mergetag", b"foo\\bar"], - ["mergetag", b"\x22\xaf\x89\x80\x01\x00"], - ], }, + "extra_headers": ( + (b"gpgsig", b"test123"), + (b"mergetag", b"foo\\bar"), + (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), + ), "synthetic": True, } @@ -253,6 +253,7 @@ "type": "git", "directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2 "metadata": None, + 
"extra_headers": (), "synthetic": False, } @@ -283,6 +284,7 @@ "type": "git", "directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2 "metadata": None, + "extra_headers": (), "synthetic": True, } @@ -315,6 +317,7 @@ "type": "git", "directory": hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), # dir "metadata": None, + "extra_headers": (), "synthetic": False, } diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py --- a/swh/storage/tests/test_converters.py +++ b/swh/storage/tests/test_converters.py @@ -85,6 +85,7 @@ "committer_email": b"comm-email", "metadata": {}, "synthetic": False, + "extra_headers": (), "parents": [123, 456], } ) @@ -109,6 +110,7 @@ "message": b"commit message", "metadata": {}, "synthetic": False, + "extra_headers": (), "parents": [123, 456], }