Page MenuHomeSoftware Heritage

D3426.diff
No OneTemporary

D3426.diff

diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -24,6 +24,9 @@
[mypy-django.*]
ignore_missing_imports = True
+[mypy-msgpack.*]
+ignore_missing_imports = True
+
[mypy-multiprocessing.util]
ignore_missing_imports = True
diff --git a/requirements-swh-journal.txt b/requirements-swh-journal.txt
--- a/requirements-swh-journal.txt
+++ b/requirements-swh-journal.txt
@@ -1 +1 @@
-swh.journal >= 0.3.2
+swh.journal >= 0.4
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,3 +1,3 @@
-swh.core[db,http] >= 0.0.94
-swh.model >= 0.3.4
+swh.core[db,http] >= 0.1.0
+swh.model >= 0.4.0
swh.objstorage >= 0.0.40
diff --git a/sql/upgrades/158.sql b/sql/upgrades/158.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/158.sql
@@ -0,0 +1,76 @@
+-- SWH DB schema upgrade
+-- from_version: 157
+-- to_version: 158
+-- description: Add the extra_headers column in the revision table
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(158, now(), 'Work Still In Progress');
+
+-- Adapt the revision table for the new extra_headers column
+alter table revision add column (extra_headers bytea[][]);
+
+-- Adapt the revision_entry type for the new extra_headers attribute
+alter type revision_entry add attribute (extra_headers bytea[][]);
+
+-- Create entries in revision from tmp_revision
+create or replace function swh_revision_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_person_add_from_revision();
+
+ insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
+ from tmp_revision t
+ left join person a on a.fullname = t.author_fullname
+ left join person c on c.fullname = t.committer_fullname;
+ return;
+end
+$$;
+
+-- "git style" revision log. Similar to swh_revision_list(), but returning all
+-- information associated to each revision, and expanding authors/committers
+create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email,
+ c.id, c.fullname, c.name, c.email,
+ r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
+ from swh_revision_list(root_revisions, num_revs) as t
+ left join revision r on t.id = r.id
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer;
+$$;
+
+create or replace function swh_revision_list_by_object_id(
+ min_excl bigint,
+ max_incl bigint
+)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ with revs as (
+ select * from revision
+ where object_id > min_excl and object_id <= max_incl
+ )
+ select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
+ array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
+ as parents, r.object_id
+ from revs r
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer
+ order by r.object_id;
+$$;
+
+-- TODO: add the migration magic query...
diff --git a/swh/storage/cassandra/converters.py b/swh/storage/cassandra/converters.py
--- a/swh/storage/cassandra/converters.py
+++ b/swh/storage/cassandra/converters.py
@@ -22,7 +22,6 @@
)
from swh.model.hashutil import DEFAULT_ALGORITHMS
-from ..converters import git_headers_to_db, db_to_git_headers
from .common import Row
@@ -33,11 +32,11 @@
# non-recursively convert it as a dict but make a deep copy.
db_revision = deepcopy(attr.asdict(revision, recurse=False))
metadata = revision.metadata
- if metadata and "extra_headers" in metadata:
- db_revision["metadata"]["extra_headers"] = git_headers_to_db(
- metadata["extra_headers"]
- )
+ extra_headers = revision.extra_headers
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = db_revision["metadata"].pop("extra_headers")
db_revision["metadata"] = json.dumps(db_revision["metadata"])
+ db_revision["extra_headers"] = extra_headers
db_revision["type"] = db_revision["type"].value
return db_revision
@@ -45,13 +44,16 @@
def revision_from_db(db_revision: Row, parents: Tuple[Sha1Git]) -> Revision:
revision = db_revision._asdict() # type: ignore
metadata = json.loads(revision.pop("metadata", None))
- if metadata and "extra_headers" in metadata:
- extra_headers = db_to_git_headers(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
+ extra_headers = revision.pop("extra_headers", ())
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = metadata.pop("extra_headers")
+ if extra_headers is None:
+ extra_headers = ()
return Revision(
parents=parents,
type=RevisionType(revision.pop("type")),
metadata=metadata,
+ extra_headers=extra_headers,
**revision,
)
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -425,6 +425,7 @@
"committer",
"synthetic",
"metadata",
+ "extra_headers",
]
@_prepared_exists_statement("revision")
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -97,9 +97,10 @@
committer person,
synthetic boolean,
-- true iff revision has been created by Software Heritage
- metadata text
- -- extra metadata as JSON(tarball checksums,
- -- extra commit information, etc...)
+ metadata text,
+ -- extra metadata as JSON(tarball checksums, etc...)
+ extra_headers frozen<list <list<blob>> >
+ -- extra commit information as (tuple(key, value), ...)
);
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -7,7 +7,7 @@
from typing import Optional, Dict
-from swh.core.utils import decode_with_escape, encode_with_unescape
+from swh.core.utils import encode_with_unescape
from swh.model import identifiers
from swh.model.hashutil import MultiHash
@@ -64,31 +64,10 @@
}
-def git_headers_to_db(git_headers):
- """Convert git headers to their database representation.
-
- We convert the bytes to unicode by decoding them into utf-8 and replacing
- invalid utf-8 sequences with backslash escapes.
-
- """
- ret = []
- for key, values in git_headers:
- if isinstance(values, list):
- ret.append([key, [decode_with_escape(value) for value in values]])
- else:
- ret.append([key, decode_with_escape(values)])
-
- return ret
-
-
def db_to_git_headers(db_git_headers):
ret = []
- for key, values in db_git_headers:
- if isinstance(values, list):
- ret.append([key, [encode_with_unescape(value) for value in values]])
- else:
- ret.append([key, encode_with_unescape(values)])
-
+ for key, value in db_git_headers:
+ ret.append([key.encode("utf-8"), encode_with_unescape(value)])
return ret
@@ -168,13 +147,6 @@
committer = author_to_db(revision["committer"])
committer_date = date_to_db(revision["committer_date"])
- metadata = revision["metadata"]
-
- if metadata and "extra_headers" in metadata:
- metadata = metadata.copy()
- extra_headers = git_headers_to_db(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
-
return {
"id": revision["id"],
"author_fullname": author["fullname"],
@@ -192,8 +164,9 @@
"type": revision["type"],
"directory": revision["directory"],
"message": revision["message"],
- "metadata": metadata,
+ "metadata": revision["metadata"],
"synthetic": revision["synthetic"],
+ "extra_headers": revision["extra_headers"],
"parents": [
{"id": revision["id"], "parent_id": parent, "parent_rank": i,}
for i, parent in enumerate(revision["parents"])
@@ -227,18 +200,17 @@
db_revision["committer_date_neg_utc_offset"],
)
- metadata = db_revision["metadata"]
-
- if metadata and "extra_headers" in metadata:
- extra_headers = db_to_git_headers(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
-
parents = []
if "parents" in db_revision:
for parent in db_revision["parents"]:
if parent:
parents.append(parent)
+ metadata = db_revision["metadata"]
+ extra_headers = db_revision.get("extra_headers", ())
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = db_to_git_headers(metadata.pop("extra_headers"))
+
ret = {
"id": db_revision["id"],
"author": author,
@@ -250,6 +222,7 @@
"message": db_revision["message"],
"metadata": metadata,
"synthetic": db_revision["synthetic"],
+ "extra_headers": extra_headers,
"parents": parents,
}
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -423,6 +423,7 @@
"committer_email",
"metadata",
"synthetic",
+ "extra_headers",
]
revision_get_cols = revision_add_cols + ["parents"]
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(157, now(), 'Work In Progress');
+ values(158, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -239,7 +239,8 @@
metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...)
object_id bigserial,
date_neg_utc_offset boolean,
- committer_date_neg_utc_offset boolean
+ committer_date_neg_utc_offset boolean,
+ extra_headers bytea[][] -- extra headers (used in hash computation)
);
comment on table revision is 'A revision represents the state of a source code tree at a specific point in time';
@@ -258,6 +259,7 @@
comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage';
comment on column revision.metadata is 'Extra revision metadata';
comment on column revision.object_id is 'Non-intrinsic, sequential object identifier';
+comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation';
-- either this table or the sha1_git[] column on the revision table
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -454,6 +454,7 @@
committer_email bytea,
metadata jsonb,
synthetic boolean,
+ extra_headers bytea[][],
parents bytea[],
object_id bigint
);
@@ -471,7 +472,7 @@
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email,
c.id, c.fullname, c.name, c.email,
- r.metadata, r.synthetic, t.parents, r.object_id
+ r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
from swh_revision_list(root_revisions, num_revs) as t
left join revision r on t.id = r.id
left join person a on a.id = r.author
@@ -528,8 +529,8 @@
begin
perform swh_person_add_from_revision();
- insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic)
- select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic
+ insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
from tmp_revision t
left join person a on a.fullname = t.author_fullname
left join person c on c.fullname = t.committer_fullname;
@@ -791,7 +792,7 @@
select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
r.type, r.directory, r.message,
- a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
+ a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
as parents, r.object_id
from revs r
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -217,12 +217,12 @@
"metadata": {
"checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",},
"signed-off-by": "some-dude",
- "extra_headers": [
- ["gpgsig", b"test123"],
- ["mergetag", b"foo\\bar"],
- ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
- ],
},
+ "extra_headers": (
+ (b"gpgsig", b"test123"),
+ (b"mergetag", b"foo\\bar"),
+ (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
+ ),
"synthetic": True,
}
@@ -253,6 +253,7 @@
"type": "git",
"directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2
"metadata": None,
+ "extra_headers": (),
"synthetic": False,
}
@@ -283,6 +284,7 @@
"type": "git",
"directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2
"metadata": None,
+ "extra_headers": (),
"synthetic": True,
}
@@ -315,6 +317,7 @@
"type": "git",
"directory": hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), # dir
"metadata": None,
+ "extra_headers": (),
"synthetic": False,
}
diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py
--- a/swh/storage/tests/test_converters.py
+++ b/swh/storage/tests/test_converters.py
@@ -85,6 +85,7 @@
"committer_email": b"comm-email",
"metadata": {},
"synthetic": False,
+ "extra_headers": (),
"parents": [123, 456],
}
)
@@ -109,6 +110,7 @@
"message": b"commit message",
"metadata": {},
"synthetic": False,
+ "extra_headers": (),
"parents": [123, 456],
}
@@ -147,14 +149,3 @@
"target": b"revision-id",
"target_type": "revision",
}
-
-
-def test_db_to_git_headers():
- raw_data = [
- ["gpgsig", b"garbage\x89a\x43b\x14"],
- ["extra", [b"foo\\\\\\o", b"bar\\", b"inval\\\\\x99id"]],
- ]
-
- db_data = converters.git_headers_to_db(raw_data)
- loop = converters.db_to_git_headers(db_data)
- assert raw_data == loop
diff --git a/swh/storage/tests/test_revision_bw_compat.py b/swh/storage/tests/test_revision_bw_compat.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_revision_bw_compat.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+
+from swh.core.utils import decode_with_escape
+from swh.model.model import Revision
+from swh.storage import get_storage
+from swh.storage.tests.storage_data import data
+from swh.storage.tests.test_storage import db_transaction
+
+
+def headers_to_db(git_headers):
+ return [[key, decode_with_escape(value)] for key, value in git_headers]
+
+
+def test_revision_extra_header_in_metadata(swh_storage_backend_config):
+ storage = get_storage(**swh_storage_backend_config)
+ rev = Revision.from_dict(data.revision)
+
+ md_w_extra = dict(
+ rev.metadata.items(),
+ extra_headers=headers_to_db(
+ [
+ ["gpgsig", b"test123"],
+ ["mergetag", b"foo\\bar"],
+ ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
+ ]
+ ),
+ )
+
+ bw_rev = attr.evolve(rev, extra_headers=())
+ object.__setattr__(bw_rev, "metadata", md_w_extra)
+ assert bw_rev.extra_headers == ()
+
+ assert storage.revision_add([bw_rev]) == {"revision:add": 1}
+
+ # check data in the db are old format
+ with db_transaction(storage) as (_, cur):
+ cur.execute("SELECT metadata, extra_headers FROM revision")
+ metadata, extra_headers = cur.fetchone()
+ assert extra_headers == []
+ assert metadata == bw_rev.metadata
+
+ # check the Revision build from revision_get is the original, "new style", Revision
+ assert [Revision.from_dict(x) for x in storage.revision_get([rev.id])] == [rev]

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 3:47 PM (2 d, 20 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223899

Event Timeline