diff --git a/sql/upgrades/179.sql b/sql/upgrades/179.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/179.sql @@ -0,0 +1,159 @@ +-- SWH DB schema upgrade +-- from_version: 178 +-- to_version: 179 +-- description: add {,committer_}date_offset_bytes to rev/rel + raw_manifest to dir/rev/rel, part 1 + +insert into dbversion(version, release, description) + values(179, now(), 'Work In Progress'); + +-- step 1: add columns, update functions + +alter table release + add column date_offset_bytes bytea, + add column raw_manifest bytea; +comment on column release.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column release.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; + + +alter table revision + add column date_offset_bytes bytea, + add column committer_date_offset_bytes bytea, + add column raw_manifest bytea; +comment on column revision.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column revision.committer_date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column revision.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; + +drop function swh_revision_log; +drop function swh_revision_list_by_object_id; +drop function swh_revision_add; +drop type revision_entry; +create type revision_entry as +( + id sha1_git, + date timestamptz, + date_offset smallint, + date_neg_utc_offset boolean, + date_offset_bytes bytea, + committer_date timestamptz, + committer_date_offset smallint, + committer_date_neg_utc_offset boolean, + committer_date_offset_bytes bytea, + type revision_type, + directory sha1_git, + message bytea, + author_id bigint, + author_fullname bytea, + author_name bytea, + author_email bytea, + committer_id bigint, + committer_fullname bytea, + committer_name bytea, + committer_email bytea, + metadata jsonb, + synthetic boolean, + parents bytea[], + object_id bigint, + extra_headers bytea[][], + raw_manifest bytea +); + +alter table directory + add column raw_manifest bytea; +comment on column directory.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; + +create or replace function swh_directory_add() + returns void + language plpgsql +as $$ +begin + perform swh_directory_entry_add('file'); + perform swh_directory_entry_add('dir'); + perform swh_directory_entry_add('rev'); + + insert into directory (id, dir_entries, file_entries, rev_entries, raw_manifest) + select id, dir_entries, file_entries, rev_entries, raw_manifest from tmp_directory t + where not exists ( + select 1 from directory d + where d.id = t.id); + + return; +end +$$; + +create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL) + returns setof revision_entry + language sql + stable +as $$ + select t.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, + r.type, r.directory, r.message, + a.id, a.fullname, a.name, a.email, + c.id, c.fullname, c.name, c.email, + r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers, + r.raw_manifest + from swh_revision_list(root_revisions, num_revs) 
as t + left join revision r on t.id = r.id + left join person a on a.id = r.author + left join person c on c.id = r.committer; +$$; + +create or replace function swh_revision_list_by_object_id( + min_excl bigint, + max_incl bigint +) + returns setof revision_entry + language sql + stable +as $$ + with revs as ( + select * from revision + where object_id > min_excl and object_id <= max_incl + ) + select r.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, + r.type, r.directory, r.message, + a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, + array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank) + as parents, r.object_id, r.extra_headers, r.raw_manifest + from revs r + left join person a on a.id = r.author + left join person c on c.id = r.committer + order by r.object_id; +$$; + +create or replace function swh_revision_add() + returns void + language plpgsql +as $$ +begin + perform swh_person_add_from_revision(); + + insert into revision (id, date, date_offset, date_neg_utc_offset, date_offset_bytes, committer_date, committer_date_offset, committer_date_neg_utc_offset, committer_date_offset_bytes, type, directory, message, author, committer, metadata, synthetic, extra_headers, raw_manifest) + select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.committer_date_offset_bytes, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers, t.raw_manifest + from tmp_revision t + left join person a on a.fullname = t.author_fullname + left join person c on c.fullname = t.committer_fullname; + return; +end +$$; + +create or replace function swh_release_add() + returns void + language plpgsql +as $$ +begin + perform swh_person_add_from_release(); + + insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, date_offset_bytes, name, comment, author, synthetic, raw_manifest) + select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.name, t.comment, a.id, t.synthetic, t.raw_manifest + from tmp_release t + left join person a on a.fullname = t.author_fullname + where not exists (select 1 from release where t.id = release.id); + return; +end +$$; + +-- step 2: upgrade python code to start writing to them + +-- data migrations in 180.sql diff --git a/sql/upgrades/180.sql b/sql/upgrades/180.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/180.sql @@ -0,0 +1,88 @@ +-- SWH DB schema upgrade +-- from_version: 177 +-- to_version: 180 +-- description: add {,committer_}date_offset_bytes to rev/rel + raw_manifest to dir/rev/rel, part 2 + +insert into dbversion(version, release, description) + values(180, now(), 'Work In Progress'); + +-- copied from 60-indexes.sql +select swh_get_dbflavor() != 'read_replica' as dbflavor_does_deduplication \gset + +-- step 3: fill the offsets + +create or replace function _format_offset(offset_ smallint, neg_utc_offset bool) + returns bytea + language plpgsql +as $$ + begin + return convert_to( + -- sign + case when offset_ < 0 or neg_utc_offset then '-' else '+' end + -- hours (unfortunately we can't use lpad because it truncates) + || case when abs(offset_) >= 600 then + cast(abs(offset_) / 60 as text) + else + '0' || cast(abs(offset_) / 60 as 
text) + end + -- minutes + || lpad(cast(mod(abs(offset_), 60) as text), 2, '0'), + 'utf8' + ); + end +$$; + +-- make sure it's correct +do $$ begin + assert (select _format_offset(NULL::smallint, NULL::bool)) is not distinct from NULL; + assert (select _format_offset(0::smallint, false)) = '+0000'::bytea; + assert (select _format_offset(0::smallint, true)) = '-0000'::bytea; + assert (select _format_offset(1::smallint, false)) = '+0001'::bytea; + assert (select _format_offset(-1::smallint, false)) = '-0001'::bytea; + assert (select _format_offset(120::smallint, false)) = '+0200'::bytea; + assert (select _format_offset(-120::smallint, false)) = '-0200'::bytea; + assert (select _format_offset(6000::smallint, false)) = '+10000'::bytea; + assert (select _format_offset(-6000::smallint, false)) = '-10000'::bytea; +end$$; + +update release + set date_offset_bytes=_format_offset(date_offset, date_neg_utc_offset) + where date is not null and date_offset_bytes is null; + +update revision + set date_offset_bytes=_format_offset(date_offset, date_neg_utc_offset), + committer_date_offset_bytes=_format_offset(committer_date_offset, committer_date_neg_utc_offset) + where (date is not null and date_offset_bytes is null) + or (committer_date is not null and committer_date_offset_bytes is null); + + +-- step 4: add integrity constraints + +\if :dbflavor_does_deduplication + + -- add new constraint on release dates + alter table release + add constraint release_date_offset_not_null + check (date is null or date_offset_bytes is not null) + not valid; + alter table release + validate constraint release_date_offset_not_null; + + alter table revision + add constraint revision_date_offset_not_null + check (date is null or date_offset_bytes is not null) + not valid; + + alter table revision + add constraint revision_committer_date_offset_not_null + check (committer_date is null or committer_date_offset_bytes is not null) + not valid; + + alter table revision + validate constraint revision_date_offset_not_null; + alter table revision + validate constraint revision_committer_date_offset_not_null; +\endif + +-- step 5: remove the old columns (date_offset and date_neg_utc_offset): in a future migration... 
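For reference, the ``+HHMM``/``-HHMM`` encoding produced by ``_format_offset`` above can be mirrored in Python. This is a minimal sketch (the function name and signature are illustrative, not part of this patch); its outputs line up with the assertions in the migration:

from typing import Optional


def format_offset(offset: Optional[int], neg_utc_offset: Optional[bool]) -> Optional[bytes]:
    """Rough Python equivalent of the SQL _format_offset() above: render an
    offset in minutes as git's +HHMM/-HHMM byte string."""
    if offset is None:
        return None
    sign = "-" if offset < 0 or neg_utc_offset else "+"
    hours, minutes = divmod(abs(offset), 60)
    # "%02d" pads to two digits but never truncates, so offsets of ten hours
    # or more keep all their digits (e.g. +10000).
    return f"{sign}{hours:02d}{minutes:02d}".encode()


assert format_offset(None, None) is None
assert format_offset(0, False) == b"+0000"
assert format_offset(0, True) == b"-0000"
assert format_offset(-1, False) == b"-0001"
assert format_offset(120, False) == b"+0200"
assert format_offset(6000, False) == b"+10000"
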
+ diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py --- a/swh/storage/backfill.py +++ b/swh/storage/backfill.py @@ -79,7 +79,7 @@ "status", "reason", ], - "directory": ["id", "dir_entries", "file_entries", "rev_entries"], + "directory": ["id", "dir_entries", "file_entries", "rev_entries", "raw_manifest"], "extid": ["extid_type", "extid", "extid_version", "target_type", "target"], "metadata_authority": ["type", "url"], "metadata_fetcher": ["name", "version"], @@ -117,9 +117,11 @@ "date", "date_offset", "date_neg_utc_offset", + "date_offset_bytes", "committer_date", "committer_date_offset", "committer_date_neg_utc_offset", + "committer_date_offset_bytes", "type", "directory", "message", @@ -131,6 +133,7 @@ "where rh.id = revision.id order by rh.parent_rank asc)", "parents", ), + "raw_manifest", ("a.id", "author_id"), ("a.name", "author_name"), ("a.email", "author_email"), @@ -145,6 +148,7 @@ "date", "date_offset", "date_neg_utc_offset", + "date_offset_bytes", "comment", ("release.name", "name"), "synthetic", @@ -154,6 +158,7 @@ ("a.name", "author_name"), ("a.email", "author_email"), ("a.fullname", "author_fullname"), + "raw_manifest", ], "snapshot": ["id", "object_id"], } diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py --- a/swh/storage/cassandra/model.py +++ b/swh/storage/cassandra/model.py @@ -114,6 +114,7 @@ PARTITION_KEY = ("id",) id: bytes + raw_manifest: Optional[bytes] @dataclasses.dataclass @@ -145,6 +146,7 @@ synthetic: bool metadata: str extra_headers: dict + raw_manifest: Optional[bytes] @dataclasses.dataclass @@ -171,6 +173,7 @@ message: bytes author: Person synthetic: bool + raw_manifest: Optional[bytes] @dataclasses.dataclass diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py --- a/swh/storage/cassandra/schema.py +++ b/swh/storage/cassandra/schema.py @@ -56,13 +56,14 @@ """ CREATE TYPE IF NOT EXISTS microtimestamp ( seconds bigint, - microseconds int + microseconds int, );""", """ CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone ( timestamp frozen, offset smallint, - negative_utc boolean + negative_utc boolean, + offset_bytes blob, );""", """ CREATE TYPE IF NOT EXISTS person ( @@ -110,8 +111,10 @@ -- true iff revision has been created by Software Heritage metadata text, -- extra metadata as JSON(tarball checksums, etc...) - extra_headers frozen> > + extra_headers frozen> >, -- extra commit information as (tuple(key, value), ...) + raw_manifest blob, + -- NULL if the object can be rebuild from other cells and revision_parent. );""", """ CREATE TABLE IF NOT EXISTS revision_parent ( @@ -133,10 +136,14 @@ author person, synthetic boolean, -- true iff release has been created by Software Heritage + raw_manifest blob, + -- NULL if the object can be rebuild from other cells );""", """ CREATE TABLE IF NOT EXISTS directory ( id blob PRIMARY KEY, + raw_manifest blob + -- NULL if the object can be rebuild from (sorted) entries );""", """ CREATE TABLE IF NOT EXISTS directory_entry ( diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -504,7 +504,9 @@ # Add the directory *after* adding all the entries, so someone # calling snapshot_get_branch in the meantime won't end up # with half the entries. 
- self._cql_runner.directory_add_one(DirectoryRow(id=directory.id)) + self._cql_runner.directory_add_one( + DirectoryRow(id=directory.id, raw_manifest=directory.raw_manifest) + ) return {"directory:add": len(directories)} diff --git a/swh/storage/postgresql/converters.py b/swh/storage/postgresql/converters.py --- a/swh/storage/postgresql/converters.py +++ b/swh/storage/postgresql/converters.py @@ -39,6 +39,7 @@ "timestamp": None, "offset": 0, "neg_utc_offset": None, + "offset_bytes": None, } @@ -84,7 +85,10 @@ def db_to_date( - date: Optional[datetime.datetime], offset: int, neg_utc_offset: Optional[bool] + date: Optional[datetime.datetime], + offset: int, + neg_utc_offset: Optional[bool], + offset_bytes: Optional[bytes], ) -> Optional[TimestampWithTimezone]: """Convert the DB representation of a date to a swh-model compatible date. @@ -105,6 +109,11 @@ # For older versions of the database that were not migrated to schema v160 neg_utc_offset = False + kwargs = {} + if offset_bytes: + # TODO: remove the conditional after migration is complete. + kwargs["offset_bytes"] = offset_bytes + return TimestampWithTimezone( timestamp=Timestamp( # we use floor() instead of int() to round down, because of negative dates @@ -113,6 +122,7 @@ ), offset=offset, negative_utc=neg_utc_offset, + **kwargs, ) @@ -123,12 +133,14 @@ ts_with_tz: a TimestampWithTimezone object Returns: - dict: a dictionary with three keys: + dict: a dictionary with these keys: - timestamp: a date in ISO format - offset: the UTC offset in minutes - neg_utc_offset: a boolean indicating whether a null offset is negative or positive. + - offset_bytes: a byte representation of the latter two, usually as "+HHMM" + or "-HHMM" """ @@ -145,6 +157,7 @@ "timestamp": timestamp.isoformat(), "offset": ts_with_tz.offset, "neg_utc_offset": ts_with_tz.negative_utc, + "offset_bytes": ts_with_tz.offset_bytes, } @@ -165,18 +178,21 @@ "date": date["timestamp"], "date_offset": date["offset"], "date_neg_utc_offset": date["neg_utc_offset"], + "date_offset_bytes": date["offset_bytes"], "committer_fullname": committer["fullname"], "committer_name": committer["name"], "committer_email": committer["email"], "committer_date": committer_date["timestamp"], "committer_date_offset": committer_date["offset"], "committer_date_neg_utc_offset": committer_date["neg_utc_offset"], + "committer_date_offset_bytes": committer_date["offset_bytes"], "type": revision.type.value, "directory": revision.directory, "message": revision.message, "metadata": None if revision.metadata is None else dict(revision.metadata), "synthetic": revision.synthetic, "extra_headers": revision.extra_headers, + "raw_manifest": revision.raw_manifest, "parents": [ {"id": revision.id, "parent_id": parent, "parent_rank": i,} for i, parent in enumerate(revision.parents) @@ -202,6 +218,7 @@ db_revision["date"], db_revision["date_offset"], db_revision["date_neg_utc_offset"], + db_revision["date_offset_bytes"], ) committer = db_to_author( @@ -213,6 +230,7 @@ db_revision["committer_date"], db_revision["committer_date_offset"], db_revision["committer_date_neg_utc_offset"], + db_revision["committer_date_offset_bytes"], ) assert author, "author is None" @@ -246,6 +264,7 @@ synthetic=db_revision["synthetic"], extra_headers=extra_headers, parents=tuple(parents), + raw_manifest=db_revision["raw_manifest"], ) @@ -263,11 +282,13 @@ "date": date["timestamp"], "date_offset": date["offset"], "date_neg_utc_offset": date["neg_utc_offset"], + "date_offset_bytes": date["offset_bytes"], "name": release.name, "target": 
release.target, "target_type": release.target_type.value, "comment": release.message, "synthetic": release.synthetic, + "raw_manifest": release.raw_manifest, } @@ -285,7 +306,10 @@ db_release["author_email"], ) date = db_to_date( - db_release["date"], db_release["date_offset"], db_release["date_neg_utc_offset"] + db_release["date"], + db_release["date_offset"], + db_release["date_neg_utc_offset"], + db_release["date_offset_bytes"], ) return Release( @@ -297,6 +321,7 @@ synthetic=db_release["synthetic"], target=db_release["target"], target_type=ObjectType(db_release["target_type"]), + raw_manifest=db_release["raw_manifest"], ) diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -30,7 +30,7 @@ """ - current_version = 178 + current_version = 180 def mktemp_dir_entry(self, entry_type, cur=None): self._cursor(cur).execute( @@ -437,9 +437,11 @@ "date", "date_offset", "date_neg_utc_offset", + "date_offset_bytes", "committer_date", "committer_date_offset", "committer_date_neg_utc_offset", + "committer_date_offset_bytes", "type", "directory", "message", @@ -452,6 +454,7 @@ "metadata", "synthetic", "extra_headers", + "raw_manifest", ] revision_get_cols = revision_add_cols + ["parents"] @@ -1200,9 +1203,11 @@ "date", "date_offset", "date_neg_utc_offset", + "date_offset_bytes", "name", "comment", "synthetic", + "raw_manifest", "author_fullname", "author_name", "author_email", diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -524,7 +524,7 @@ # Copy directory ids dirs_missing_dict = ({"id": dir} for dir in dirs_missing) db.mktemp("directory", cur) - db.copy_to(dirs_missing_dict, "tmp_directory", ["id"], cur) + db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur) # Copy entries for entry_type, entry_list in dir_entries.items(): diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql --- a/swh/storage/sql/30-schema.sql +++ b/swh/storage/sql/30-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(178, now(), 'Work In Progress'); + values(180, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -137,7 +137,8 @@ dir_entries bigint[], -- sub-directories, reference directory_entry_dir file_entries bigint[], -- contained files, reference directory_entry_file rev_entries bigint[], -- mounted revisions, reference directory_entry_rev - object_id bigserial -- short object identifier + object_id bigserial, -- short object identifier + raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields ); comment on table directory is 'Contents of a directory, synonymous to tree (git)'; @@ -146,6 +147,7 @@ comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; comment on column directory.object_id is 'Short object identifier'; +comment on column directory.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- A directory entry pointing to a (sub-)directory. 
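To illustrate the converter change earlier in this diff (swh/storage/postgresql/converters.py), here is a hedged usage sketch of ``db_to_date`` with its new ``offset_bytes`` argument; the concrete values are made up, and the ``TimestampWithTimezone`` fields follow the tests further down:

import datetime

from swh.storage.postgresql.converters import db_to_date

date = datetime.datetime(2009, 2, 13, 23, 31, 30, tzinfo=datetime.timezone.utc)

# A backfilled row: the raw bytes are carried through to the model object.
tstz = db_to_date(date, 120, False, b"+0200")
assert tstz is not None and tstz.offset_bytes == b"+0200"

# A row not yet migrated: offset_bytes is still NULL, so the model object is
# built from offset/neg_utc_offset alone, as before.
tstz = db_to_date(date, 120, False, None)
assert tstz is not None and tstz.offset == 120
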
@@ -240,7 +242,10 @@ object_id bigserial, date_neg_utc_offset boolean, committer_date_neg_utc_offset boolean, - extra_headers bytea[][] not null -- extra headers (used in hash computation) + extra_headers bytea[][] not null, -- extra headers (used in hash computation) + date_offset_bytes bytea, + committer_date_offset_bytes bytea, + raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields ); comment on table revision is 'A revision represents the state of a source code tree at a specific point in time'; @@ -260,6 +265,9 @@ comment on column revision.metadata is 'Extra revision metadata'; comment on column revision.object_id is 'Non-intrinsic, sequential object identifier'; comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation'; +comment on column revision.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column revision.committer_date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column revision.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- either this table or the sha1_git[] column on the revision table @@ -378,7 +386,9 @@ synthetic boolean not null default false, -- true iff release has been created by Software Heritage object_id bigserial, target_type object_type not null, - date_neg_utc_offset boolean + date_neg_utc_offset boolean, + date_offset_bytes bytea, + raw_manifest bytea ); comment on table release is 'Details of a software release, synonymous with @@ -395,6 +405,8 @@ comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'', ''release'', ''snapshot'')'; comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; +comment on column release.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. 
It should follow this format: ``+HHMM`` or ``-HHMM``'; +comment on column release.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields'; -- Tools create table metadata_fetcher diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -264,8 +264,8 @@ perform swh_directory_entry_add('dir'); perform swh_directory_entry_add('rev'); - insert into directory - select * from tmp_directory t + insert into directory (id, dir_entries, file_entries, rev_entries, raw_manifest) + select id, dir_entries, file_entries, rev_entries, raw_manifest from tmp_directory t where not exists ( select 1 from directory d where d.id = t.id); @@ -476,9 +476,11 @@ date timestamptz, date_offset smallint, date_neg_utc_offset boolean, + date_offset_bytes bytea, committer_date timestamptz, committer_date_offset smallint, committer_date_neg_utc_offset boolean, + committer_date_offset_bytes bytea, type revision_type, directory sha1_git, message bytea, @@ -494,7 +496,8 @@ synthetic boolean, parents bytea[], object_id bigint, - extra_headers bytea[][] + extra_headers bytea[][], + raw_manifest bytea ); @@ -505,12 +508,13 @@ language sql stable as $$ - select t.id, r.date, r.date_offset, r.date_neg_utc_offset, - r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, + select t.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, r.type, r.directory, r.message, a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, - r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers + r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers, + r.raw_manifest from swh_revision_list(root_revisions, num_revs) as t left join revision r on t.id = r.id left join person a on a.id = r.author @@ -567,8 +571,8 @@ begin perform swh_person_add_from_revision(); - insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers) - select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers + insert into revision (id, date, date_offset, date_neg_utc_offset, date_offset_bytes, committer_date, committer_date_offset, committer_date_neg_utc_offset, committer_date_offset_bytes, type, directory, message, author, committer, metadata, synthetic, extra_headers, raw_manifest) + select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.committer_date_offset_bytes, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers, t.raw_manifest from tmp_revision t left join person a on a.fullname = t.author_fullname left join person c on c.fullname = t.committer_fullname; @@ -623,8 +627,8 @@ begin perform swh_person_add_from_release(); - insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic) - select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic + insert into release (id, target, target_type, 
date, date_offset, date_neg_utc_offset, date_offset_bytes, name, comment, author, synthetic, raw_manifest) + select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.name, t.comment, a.id, t.synthetic, t.raw_manifest from tmp_release t left join person a on a.fullname = t.author_fullname where not exists (select 1 from release where t.id = release.id); @@ -850,12 +854,12 @@ select * from revision where object_id > min_excl and object_id <= max_incl ) - select r.id, r.date, r.date_offset, r.date_neg_utc_offset, - r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, + select r.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes, + r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes, r.type, r.directory, r.message, a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic, array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank) - as parents, r.object_id, r.extra_headers + as parents, r.object_id, r.extra_headers, r.raw_manifest from revs r left join person a on a.id = r.author left join person c on c.id = r.committer diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql --- a/swh/storage/sql/60-indexes.sql +++ b/swh/storage/sql/60-indexes.sql @@ -124,15 +124,30 @@ add constraint revision_date_neg_utc_offset_not_null check (date is null or date_neg_utc_offset is not null) not valid; + alter table revision add constraint revision_committer_date_neg_utc_offset_not_null check (committer_date is null or committer_date_neg_utc_offset is not null) not valid; + alter table revision + add constraint revision_date_offset_not_null + check (date is null or date_offset_bytes is not null) + not valid; + + alter table revision + add constraint revision_committer_date_offset_not_null + check (committer_date is null or committer_date_offset_bytes is not null) + not valid; + alter table revision validate constraint revision_date_neg_utc_offset_not_null; alter table revision validate constraint revision_committer_date_neg_utc_offset_not_null; + alter table revision + validate constraint revision_date_offset_not_null; + alter table revision + validate constraint revision_committer_date_offset_not_null; \endif \if :dbflavor_default @@ -234,8 +249,15 @@ check (date is null or date_neg_utc_offset is not null) not valid; + alter table release + add constraint release_date_offset_not_null + check (date is null or date_offset_bytes is not null) + not valid; + alter table release validate constraint release_date_neg_utc_offset_not_null; + alter table release + validate constraint release_date_offset_not_null; -- if the author is null, then the date must be null alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid; diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -31,6 +31,7 @@ Person, RawExtrinsicMetadata, Revision, + RevisionType, SkippedContent, Snapshot, SnapshotBranch, @@ -732,9 +733,16 @@ swh_storage.directory_add(directories) for directory in directories: - assert swh_storage.directory_get_entries(directory.id) == PagedResult( - results=list(directory.entries), next_page_token=None, - ) + if directory.raw_manifest is None: + assert 
swh_storage.directory_get_entries(directory.id) == PagedResult( + results=list(directory.entries), next_page_token=None, + ) + else: + # TODO: compare the manifests are the same (currently, we can't + # because there is no way to get the raw_manifest of a directory) + # we can't compare the other fields, because they become non-intrinsic, + # so they may clash between hypothesis runs + pass def test_directory_add_twice(self, swh_storage, sample_data): directory = sample_data.directories[1] @@ -1069,6 +1077,25 @@ assert swh_storage.revision_get([revision.id])[0] == revision + def test_revision_add_with_raw_manifest(self, swh_storage, sample_data): + revision = sample_data.revision + revision = attr.evolve(revision, raw_manifest=b"foo") + revision = attr.evolve(revision, id=revision.compute_hash()) + init_missing = swh_storage.revision_missing([revision.id]) + assert list(init_missing) == [revision.id] + + actual_result = swh_storage.revision_add([revision]) + assert actual_result == {"revision:add": 1} + + end_missing = swh_storage.revision_missing([revision.id]) + assert list(end_missing) == [] + + assert list(swh_storage.journal_writer.journal.objects) == [ + ("revision", revision) + ] + + assert swh_storage.revision_get([revision.id]) == [revision] + @settings( suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large] + function_scoped_fixture_check, @@ -1086,14 +1113,21 @@ metadata=None, committer=attr.evolve(revision.committer, name=None, email=None), author=attr.evolve(revision.author, name=None, email=None), + type=RevisionType.GIT, ) for revision in revisions ] swh_storage.revision_add(revisions) - revs = swh_storage.revision_get([revision.id for revision in revisions]) - assert set(revs) == set(revisions) + for revision in revisions: + (rev,) = swh_storage.revision_get([revision.id]) + if rev.raw_manifest is None: + assert rev == revision + else: + assert rev.raw_manifest == revision.raw_manifest + # we can't compare the other fields, because they become non-intrinsic, + # so they may clash between hypothesis runs def test_revision_add_name_clash(self, swh_storage, sample_data): revision, revision2 = sample_data.revisions[:2] @@ -1522,6 +1556,26 @@ swh_storage.refresh_stat_counters() assert swh_storage.stat_counters()["release"] == 2 + def test_release_add_with_raw_manifest(self, swh_storage, sample_data): + release = sample_data.releases[0] + release = attr.evolve(release, raw_manifest=b"foo") + release = attr.evolve(release, id=release.compute_hash()) + + init_missing = swh_storage.release_missing([release.id]) + assert list(init_missing) == [release.id] + + actual_result = swh_storage.release_add([release]) + assert actual_result == {"release:add": 1} + + end_missing = swh_storage.release_missing([release.id]) + assert list(end_missing) == [] + + assert list(swh_storage.journal_writer.journal.objects) == [ + ("release", release), + ] + + assert swh_storage.release_get([release.id]) == [release] + @settings( suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large] + function_scoped_fixture_check, @@ -1543,9 +1597,14 @@ ] swh_storage.release_add(releases) - assert set( - swh_storage.release_get([release.id for release in releases]) - ) == set(releases) + for release in releases: + (rev,) = swh_storage.release_get([release.id]) + if rev.raw_manifest is None: + assert rev == release + else: + assert rev.raw_manifest == release.raw_manifest + # we can't compare the other fields, because they become non-intrinsic, + # so they may clash between hypothesis 
runs def test_release_add_no_author_date(self, swh_storage, sample_data): full_release = sample_data.release diff --git a/swh/storage/tests/test_backfill.py b/swh/storage/tests/test_backfill.py --- a/swh/storage/tests/test_backfill.py +++ b/swh/storage/tests/test_backfill.py @@ -151,6 +151,7 @@ "date", "date_offset", "date_neg_utc_offset", + "date_offset_bytes", "comment", "name", "synthetic", @@ -160,12 +161,13 @@ "author_name", "author_email", "author_fullname", + "raw_manifest", ] assert ( query == """ -select release.id as id,date,date_offset,date_neg_utc_offset,comment,release.name as name,synthetic,target,target_type,a.id as author_id,a.name as author_name,a.email as author_email,a.fullname as author_fullname +select release.id as id,date,date_offset,date_neg_utc_offset,date_offset_bytes,comment,release.name as name,synthetic,target,target_type,a.id as author_id,a.name as author_name,a.email as author_email,a.fullname as author_fullname,raw_manifest from release left join person a on release.author=a.id where (release.id) >= %s and (release.id) < %s diff --git a/swh/storage/tests/test_postgresql_converters.py b/swh/storage/tests/test_postgresql_converters.py --- a/swh/storage/tests/test_postgresql_converters.py +++ b/swh/storage/tests/test_postgresql_converters.py @@ -23,17 +23,27 @@ @pytest.mark.parametrize( "model_date,db_date", [ - (None, {"timestamp": None, "offset": 0, "neg_utc_offset": None}), + ( + None, + { + "timestamp": None, + "offset": 0, + "neg_utc_offset": None, + "offset_bytes": None, + }, + ), ( TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset=120, negative_utc=False, + offset_bytes=b"+0200", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 120, "neg_utc_offset": False, + "offset_bytes": b"+0200", }, ), ( @@ -41,11 +51,13 @@ timestamp=Timestamp(seconds=1123456789, microseconds=0,), offset=0, negative_utc=True, + offset_bytes=b"-0000", ), { "timestamp": "2005-08-07T23:19:49+00:00", "offset": 0, "neg_utc_offset": True, + "offset_bytes": b"-0000", }, ), ( @@ -53,11 +65,13 @@ timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset=42, negative_utc=False, + offset_bytes=b"+0042", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 42, "neg_utc_offset": False, + "offset_bytes": b"+0042", }, ), ( @@ -65,11 +79,13 @@ timestamp=Timestamp(seconds=1634366813, microseconds=0,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "2021-10-16T06:46:53+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", }, ), ( @@ -77,11 +93,13 @@ timestamp=Timestamp(seconds=0, microseconds=0,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1970-01-01T00:00:00+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", }, ), ( @@ -89,11 +107,13 @@ timestamp=Timestamp(seconds=0, microseconds=1,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1970-01-01T00:00:00.000001+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", }, ), ( @@ -101,11 +121,13 @@ timestamp=Timestamp(seconds=-1, microseconds=0,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:59:59+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", }, ), ( @@ -113,11 +135,13 @@ timestamp=Timestamp(seconds=-1, microseconds=1,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:59:59.000001+00:00", "offset": -120, "neg_utc_offset": 
False, + "offset_bytes": b"-0200", }, ), ( @@ -125,11 +149,13 @@ timestamp=Timestamp(seconds=-3600, microseconds=0,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:00:00+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", }, ), ( @@ -137,11 +163,27 @@ timestamp=Timestamp(seconds=-3600, microseconds=1,), offset=-120, negative_utc=False, + offset_bytes=b"-0200", ), { "timestamp": "1969-12-31T23:00:00.000001+00:00", "offset": -120, "neg_utc_offset": False, + "offset_bytes": b"-0200", + }, + ), + ( + TimestampWithTimezone( + timestamp=Timestamp(seconds=1234567890, microseconds=0,), + offset=120, + negative_utc=False, + offset_bytes=b"+200", + ), + { + "timestamp": "2009-02-13T23:31:30+00:00", + "offset": 120, + "neg_utc_offset": False, + "offset_bytes": b"+200", }, ), ], @@ -155,6 +197,7 @@ else datetime.datetime.fromisoformat(db_date["timestamp"]), offset=db_date["offset"], neg_utc_offset=db_date["neg_utc_offset"], + offset_bytes=db_date["offset_bytes"], ) == model_date ) @@ -184,9 +227,11 @@ "date": None, "date_offset": None, "date_neg_utc_offset": None, + "date_offset_bytes": None, "committer_date": None, "committer_date_offset": None, "committer_date_neg_utc_offset": None, + "committer_date_offset_bytes": None, "type": "git", "directory": b"dir-sha1", "message": b"commit message", @@ -199,6 +244,7 @@ "metadata": {}, "synthetic": False, "extra_headers": (), + "raw_manifest": None, "parents": [b"123", b"456"], } ) @@ -234,12 +280,14 @@ "date": None, "date_offset": None, "date_neg_utc_offset": None, + "date_offset_bytes": None, "name": b"release-name", "comment": b"release comment", "synthetic": True, "author_fullname": b"auth-fullname", "author_name": b"auth-name", "author_email": b"auth-email", + "raw_manifest": None, } )
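
Closing the loop on the converters, the model-to-DB direction also carries the new key. A minimal sketch, assuming the helper is ``date_to_db`` as elsewhere in swh/storage/postgresql/converters.py (the non-canonical ``b"+200"`` case mirrors the last parametrized entry above):

from swh.model.model import Timestamp, TimestampWithTimezone
from swh.storage.postgresql.converters import date_to_db

tstz = TimestampWithTimezone(
    timestamp=Timestamp(seconds=1234567890, microseconds=0),
    offset=120,
    negative_utc=False,
    offset_bytes=b"+200",  # preserved verbatim, even though not zero-padded
)

db_date = date_to_db(tstz)
# The legacy columns keep their old values; offset_bytes round-trips as-is,
# which is what recomputing the original git object hashes requires.
assert db_date["offset"] == 120
assert db_date["neg_utc_offset"] is False
assert db_date["offset_bytes"] == b"+200"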