Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123012
D3426.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
D3426.diff
View Options
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -24,6 +24,9 @@
[mypy-django.*]
ignore_missing_imports = True
+[mypy-msgpack.*]
+ignore_missing_imports = True
+
[mypy-multiprocessing.util]
ignore_missing_imports = True
diff --git a/requirements-swh-journal.txt b/requirements-swh-journal.txt
--- a/requirements-swh-journal.txt
+++ b/requirements-swh-journal.txt
@@ -1 +1 @@
-swh.journal >= 0.3.2
+swh.journal >= 0.4
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,3 +1,3 @@
-swh.core[db,http] >= 0.0.94
-swh.model >= 0.3.4
+swh.core[db,http] >= 0.1.0
+swh.model >= 0.4.0
swh.objstorage >= 0.0.40
diff --git a/sql/upgrades/158.sql b/sql/upgrades/158.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/158.sql
@@ -0,0 +1,76 @@
+-- SWH DB schema upgrade
+-- from_version: 157
+-- to_version: 158
+-- description: Add the extra_headers column in the revision table
+
+-- latest schema version
+insert into dbversion(version, release, description)
+ values(158, now(), 'Work Still In Progress');
+
+-- Adapt the revision table for the new extra_headers column
+alter table revision add column (extra_headers bytea[][]);
+
+-- Adapt the revision_entry type for the new extra_headers attribute
+alter type revision_entry add attribute (extra_headers bytea[][]);
+
+-- Create entries in revision from tmp_revision
+create or replace function swh_revision_add()
+ returns void
+ language plpgsql
+as $$
+begin
+ perform swh_person_add_from_revision();
+
+ insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
+ from tmp_revision t
+ left join person a on a.fullname = t.author_fullname
+ left join person c on c.fullname = t.committer_fullname;
+ return;
+end
+$$;
+
+-- "git style" revision log. Similar to swh_revision_list(), but returning all
+-- information associated to each revision, and expanding authors/committers
+create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email,
+ c.id, c.fullname, c.name, c.email,
+ r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
+ from swh_revision_list(root_revisions, num_revs) as t
+ left join revision r on t.id = r.id
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer;
+$$;
+
+create or replace function swh_revision_list_by_object_id(
+ min_excl bigint,
+ max_incl bigint
+)
+ returns setof revision_entry
+ language sql
+ stable
+as $$
+ with revs as (
+ select * from revision
+ where object_id > min_excl and object_id <= max_incl
+ )
+ select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ r.type, r.directory, r.message,
+ a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
+ array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
+ as parents, r.object_id
+ from revs r
+ left join person a on a.id = r.author
+ left join person c on c.id = r.committer
+ order by r.object_id;
+$$;
+
+-- TODO: add the migration magic query...
diff --git a/swh/storage/cassandra/converters.py b/swh/storage/cassandra/converters.py
--- a/swh/storage/cassandra/converters.py
+++ b/swh/storage/cassandra/converters.py
@@ -22,7 +22,6 @@
)
from swh.model.hashutil import DEFAULT_ALGORITHMS
-from ..converters import git_headers_to_db, db_to_git_headers
from .common import Row
@@ -33,11 +32,11 @@
# non-recursively convert it as a dict but make a deep copy.
db_revision = deepcopy(attr.asdict(revision, recurse=False))
metadata = revision.metadata
- if metadata and "extra_headers" in metadata:
- db_revision["metadata"]["extra_headers"] = git_headers_to_db(
- metadata["extra_headers"]
- )
+ extra_headers = revision.extra_headers
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = db_revision["metadata"].pop("extra_headers")
db_revision["metadata"] = json.dumps(db_revision["metadata"])
+ db_revision["extra_headers"] = extra_headers
db_revision["type"] = db_revision["type"].value
return db_revision
@@ -45,13 +44,16 @@
def revision_from_db(db_revision: Row, parents: Tuple[Sha1Git]) -> Revision:
revision = db_revision._asdict() # type: ignore
metadata = json.loads(revision.pop("metadata", None))
- if metadata and "extra_headers" in metadata:
- extra_headers = db_to_git_headers(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
+ extra_headers = revision.pop("extra_headers", ())
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = metadata.pop("extra_headers")
+ if extra_headers is None:
+ extra_headers = ()
return Revision(
parents=parents,
type=RevisionType(revision.pop("type")),
metadata=metadata,
+ extra_headers=extra_headers,
**revision,
)
diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py
--- a/swh/storage/cassandra/cql.py
+++ b/swh/storage/cassandra/cql.py
@@ -425,6 +425,7 @@
"committer",
"synthetic",
"metadata",
+ "extra_headers",
]
@_prepared_exists_statement("revision")
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -97,9 +97,10 @@
committer person,
synthetic boolean,
-- true iff revision has been created by Software Heritage
- metadata text
- -- extra metadata as JSON(tarball checksums,
- -- extra commit information, etc...)
+ metadata text,
+ -- extra metadata as JSON(tarball checksums, etc...)
+ extra_headers frozen<list <list<blob>> >
+ -- extra commit information as (tuple(key, value), ...)
);
diff --git a/swh/storage/converters.py b/swh/storage/converters.py
--- a/swh/storage/converters.py
+++ b/swh/storage/converters.py
@@ -7,7 +7,7 @@
from typing import Optional, Dict
-from swh.core.utils import decode_with_escape, encode_with_unescape
+from swh.core.utils import encode_with_unescape
from swh.model import identifiers
from swh.model.hashutil import MultiHash
@@ -64,31 +64,10 @@
}
-def git_headers_to_db(git_headers):
- """Convert git headers to their database representation.
-
- We convert the bytes to unicode by decoding them into utf-8 and replacing
- invalid utf-8 sequences with backslash escapes.
-
- """
- ret = []
- for key, values in git_headers:
- if isinstance(values, list):
- ret.append([key, [decode_with_escape(value) for value in values]])
- else:
- ret.append([key, decode_with_escape(values)])
-
- return ret
-
-
def db_to_git_headers(db_git_headers):
ret = []
- for key, values in db_git_headers:
- if isinstance(values, list):
- ret.append([key, [encode_with_unescape(value) for value in values]])
- else:
- ret.append([key, encode_with_unescape(values)])
-
+ for key, value in db_git_headers:
+ ret.append([key.encode("utf-8"), encode_with_unescape(value)])
return ret
@@ -168,13 +147,6 @@
committer = author_to_db(revision["committer"])
committer_date = date_to_db(revision["committer_date"])
- metadata = revision["metadata"]
-
- if metadata and "extra_headers" in metadata:
- metadata = metadata.copy()
- extra_headers = git_headers_to_db(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
-
return {
"id": revision["id"],
"author_fullname": author["fullname"],
@@ -192,8 +164,9 @@
"type": revision["type"],
"directory": revision["directory"],
"message": revision["message"],
- "metadata": metadata,
+ "metadata": revision["metadata"],
"synthetic": revision["synthetic"],
+ "extra_headers": revision["extra_headers"],
"parents": [
{"id": revision["id"], "parent_id": parent, "parent_rank": i,}
for i, parent in enumerate(revision["parents"])
@@ -227,18 +200,17 @@
db_revision["committer_date_neg_utc_offset"],
)
- metadata = db_revision["metadata"]
-
- if metadata and "extra_headers" in metadata:
- extra_headers = db_to_git_headers(metadata["extra_headers"])
- metadata["extra_headers"] = extra_headers
-
parents = []
if "parents" in db_revision:
for parent in db_revision["parents"]:
if parent:
parents.append(parent)
+ metadata = db_revision["metadata"]
+ extra_headers = db_revision.get("extra_headers", ())
+ if not extra_headers and metadata and "extra_headers" in metadata:
+ extra_headers = db_to_git_headers(metadata.pop("extra_headers"))
+
ret = {
"id": db_revision["id"],
"author": author,
@@ -250,6 +222,7 @@
"message": db_revision["message"],
"metadata": metadata,
"synthetic": db_revision["synthetic"],
+ "extra_headers": extra_headers,
"parents": parents,
}
diff --git a/swh/storage/db.py b/swh/storage/db.py
--- a/swh/storage/db.py
+++ b/swh/storage/db.py
@@ -423,6 +423,7 @@
"committer_email",
"metadata",
"synthetic",
+ "extra_headers",
]
revision_get_cols = revision_add_cols + ["parents"]
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(157, now(), 'Work In Progress');
+ values(158, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -239,7 +239,8 @@
metadata jsonb, -- extra metadata (tarball checksums, extra commit information, etc...)
object_id bigserial,
date_neg_utc_offset boolean,
- committer_date_neg_utc_offset boolean
+ committer_date_neg_utc_offset boolean,
+ extra_headers bytea[][] -- extra headers (used in hash computation)
);
comment on table revision is 'A revision represents the state of a source code tree at a specific point in time';
@@ -258,6 +259,7 @@
comment on column revision.synthetic is 'True iff revision has been synthesized by Software Heritage';
comment on column revision.metadata is 'Extra revision metadata';
comment on column revision.object_id is 'Non-intrinsic, sequential object identifier';
+comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation';
-- either this table or the sha1_git[] column on the revision table
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -454,6 +454,7 @@
committer_email bytea,
metadata jsonb,
synthetic boolean,
+ extra_headers bytea[][],
parents bytea[],
object_id bigint
);
@@ -471,7 +472,7 @@
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email,
c.id, c.fullname, c.name, c.email,
- r.metadata, r.synthetic, t.parents, r.object_id
+ r.metadata, r.synthetic, r.extra_headers, t.parents, r.object_id
from swh_revision_list(root_revisions, num_revs) as t
left join revision r on t.id = r.id
left join person a on a.id = r.author
@@ -528,8 +529,8 @@
begin
perform swh_person_add_from_revision();
- insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic)
- select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic
+ insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
from tmp_revision t
left join person a on a.fullname = t.author_fullname
left join person c on c.fullname = t.committer_fullname;
@@ -791,7 +792,7 @@
select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
r.type, r.directory, r.message,
- a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
+ a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, r.extra_headers,
array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
as parents, r.object_id
from revs r
diff --git a/swh/storage/tests/storage_data.py b/swh/storage/tests/storage_data.py
--- a/swh/storage/tests/storage_data.py
+++ b/swh/storage/tests/storage_data.py
@@ -217,12 +217,12 @@
"metadata": {
"checksums": {"sha1": "tarball-sha1", "sha256": "tarball-sha256",},
"signed-off-by": "some-dude",
- "extra_headers": [
- ["gpgsig", b"test123"],
- ["mergetag", b"foo\\bar"],
- ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
- ],
},
+ "extra_headers": (
+ (b"gpgsig", b"test123"),
+ (b"mergetag", b"foo\\bar"),
+ (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
+ ),
"synthetic": True,
}
@@ -253,6 +253,7 @@
"type": "git",
"directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2
"metadata": None,
+ "extra_headers": (),
"synthetic": False,
}
@@ -283,6 +284,7 @@
"type": "git",
"directory": hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), # dir2
"metadata": None,
+ "extra_headers": (),
"synthetic": True,
}
@@ -315,6 +317,7 @@
"type": "git",
"directory": hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), # dir
"metadata": None,
+ "extra_headers": (),
"synthetic": False,
}
diff --git a/swh/storage/tests/test_converters.py b/swh/storage/tests/test_converters.py
--- a/swh/storage/tests/test_converters.py
+++ b/swh/storage/tests/test_converters.py
@@ -85,6 +85,7 @@
"committer_email": b"comm-email",
"metadata": {},
"synthetic": False,
+ "extra_headers": (),
"parents": [123, 456],
}
)
@@ -109,6 +110,7 @@
"message": b"commit message",
"metadata": {},
"synthetic": False,
+ "extra_headers": (),
"parents": [123, 456],
}
@@ -147,14 +149,3 @@
"target": b"revision-id",
"target_type": "revision",
}
-
-
-def test_db_to_git_headers():
- raw_data = [
- ["gpgsig", b"garbage\x89a\x43b\x14"],
- ["extra", [b"foo\\\\\\o", b"bar\\", b"inval\\\\\x99id"]],
- ]
-
- db_data = converters.git_headers_to_db(raw_data)
- loop = converters.db_to_git_headers(db_data)
- assert raw_data == loop
diff --git a/swh/storage/tests/test_revision_bw_compat.py b/swh/storage/tests/test_revision_bw_compat.py
new file mode 100644
--- /dev/null
+++ b/swh/storage/tests/test_revision_bw_compat.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import attr
+
+from swh.core.utils import decode_with_escape
+from swh.model.model import Revision
+from swh.storage import get_storage
+from swh.storage.tests.storage_data import data
+from swh.storage.tests.test_storage import db_transaction
+
+
+def headers_to_db(git_headers):
+ return [[key, decode_with_escape(value)] for key, value in git_headers]
+
+
+def test_revision_extra_header_in_metadata(swh_storage_backend_config):
+ storage = get_storage(**swh_storage_backend_config)
+ rev = Revision.from_dict(data.revision)
+
+ md_w_extra = dict(
+ rev.metadata.items(),
+ extra_headers=headers_to_db(
+ [
+ ["gpgsig", b"test123"],
+ ["mergetag", b"foo\\bar"],
+ ["mergetag", b"\x22\xaf\x89\x80\x01\x00"],
+ ]
+ ),
+ )
+
+ bw_rev = attr.evolve(rev, extra_headers=())
+ object.__setattr__(bw_rev, "metadata", md_w_extra)
+ assert bw_rev.extra_headers == ()
+
+ assert storage.revision_add([bw_rev]) == {"revision:add": 1}
+
+ # check data in the db are old format
+ with db_transaction(storage) as (_, cur):
+ cur.execute("SELECT metadata, extra_headers FROM revision")
+ metadata, extra_headers = cur.fetchone()
+ assert extra_headers == []
+ assert metadata == bw_rev.metadata
+
+ # check the Revision build from revision_get is the original, "new style", Revision
+ assert [Revision.from_dict(x) for x in storage.revision_get([rev.id])] == [rev]
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 3:47 PM (3 d, 2 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223899
Attached To
D3426: Extract revision's extra_header as a top level attribute
Event Timeline
Log In to Comment