Page MenuHomeSoftware Heritage

D6848.id24903.diff
No OneTemporary

D6848.id24903.diff

diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
swh.core[db,http] >= 0.14.0
swh.counters >= v0.8.0
-swh.model >= 2.1.0
+swh.model >= 4.0.0
swh.objstorage >= 0.2.2
diff --git a/sql/upgrades/179.sql b/sql/upgrades/179.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/179.sql
@@ -0,0 +1,159 @@
+-- SWH DB schema upgrade
+-- from_version: 178
+-- to_version: 179
+-- description: add {,committer_}date_offset_bytes to rev/rel + raw_manifest to dir/rev/rel, part 1
+
+-- record the new schema version
+insert into dbversion(version, release, description)
+ values(179, now(), 'Work In Progress');
+
+-- step 1: add columns, update functions
+
+-- the new columns carry no NOT NULL constraint, so existing rows get NULL;
+-- the offset columns are backfilled by 180.sql
+alter table release
+ add column date_offset_bytes bytea,
+ add column raw_manifest bytea;
+comment on column release.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column release.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
+
+
+-- revisions have two dates (author and committer), hence two offset columns
+alter table revision
+ add column date_offset_bytes bytea,
+ add column committer_date_offset_bytes bytea,
+ add column raw_manifest bytea;
+comment on column revision.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column revision.committer_date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column revision.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
+
+-- revision_entry gains three fields (date_offset_bytes, committer_date_offset_bytes,
+-- raw_manifest). swh_revision_log and swh_revision_list_by_object_id return
+-- "setof revision_entry", so they are dropped before the type and recreated below;
+-- swh_revision_add is dropped here and recreated below as well.
+drop function swh_revision_log;
+drop function swh_revision_list_by_object_id;
+drop function swh_revision_add;
+drop type revision_entry;
+-- field order matters: the functions below fill this type positionally
+create type revision_entry as
+(
+ id sha1_git,
+ date timestamptz,
+ date_offset smallint,
+ date_neg_utc_offset boolean,
+ date_offset_bytes bytea, -- new in 179
+ committer_date timestamptz,
+ committer_date_offset smallint,
+ committer_date_neg_utc_offset boolean,
+ committer_date_offset_bytes bytea, -- new in 179
+ type revision_type,
+ directory sha1_git,
+ message bytea,
+ author_id bigint,
+ author_fullname bytea,
+ author_name bytea,
+ author_email bytea,
+ committer_id bigint,
+ committer_fullname bytea,
+ committer_name bytea,
+ committer_email bytea,
+ metadata jsonb,
+ synthetic boolean,
+ parents bytea[],
+ object_id bigint,
+ extra_headers bytea[][],
+ raw_manifest bytea -- new in 179
+);
+
+-- directories get a (nullable) raw_manifest column too
+alter table directory
+ add column raw_manifest bytea;
+comment on column directory.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
+
+-- Move the content of tmp_directory into directory.
+--
+-- swh_directory_entry_add is called once per entry type ('file', 'dir', 'rev'),
+-- then every tmp_directory row whose id is not yet present in directory is
+-- inserted, raw_manifest included.
+create or replace function swh_directory_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    perform swh_directory_entry_add('file');
+    perform swh_directory_entry_add('dir');
+    perform swh_directory_entry_add('rev');
+
+    insert into directory (id, dir_entries, file_entries, rev_entries, raw_manifest)
+    select
+        incoming.id,
+        incoming.dir_entries,
+        incoming.file_entries,
+        incoming.rev_entries,
+        incoming.raw_manifest
+    from tmp_directory as incoming
+    where not exists (
+        select 1
+        from directory as existing
+        where existing.id = incoming.id
+    );
+
+    return;
+end
+$$;
+
+-- Return one revision_entry per row produced by
+-- swh_revision_list(root_revisions, num_revs).
+--
+-- The id and parents come from swh_revision_list; every other field comes from
+-- the revision table and the author/committer person rows. All three are
+-- left-joined, so a listed id without a matching revision row still yields a
+-- row, with NULL in those fields.
+create or replace function swh_revision_log(root_revisions bytea[], num_revs bigint default NULL)
+    returns setof revision_entry
+    language sql
+    stable
+as $$
+    select
+        listed.id,
+        rev.date,
+        rev.date_offset,
+        rev.date_neg_utc_offset,
+        rev.date_offset_bytes,
+        rev.committer_date,
+        rev.committer_date_offset,
+        rev.committer_date_neg_utc_offset,
+        rev.committer_date_offset_bytes,
+        rev.type,
+        rev.directory,
+        rev.message,
+        author_person.id,
+        author_person.fullname,
+        author_person.name,
+        author_person.email,
+        committer_person.id,
+        committer_person.fullname,
+        committer_person.name,
+        committer_person.email,
+        rev.metadata,
+        rev.synthetic,
+        listed.parents,
+        rev.object_id,
+        rev.extra_headers,
+        rev.raw_manifest
+    from swh_revision_list(root_revisions, num_revs) as listed
+    left join revision as rev
+        on listed.id = rev.id
+    left join person as author_person
+        on author_person.id = rev.author
+    left join person as committer_person
+        on committer_person.id = rev.committer;
+$$;
+
+-- Return the revisions whose object_id lies in (min_excl, max_incl], ordered by
+-- object_id, as revision_entry rows.
+--
+-- Author and committer are left-joined from person; parents are collected from
+-- revision_history, ordered by parent_rank.
+create or replace function swh_revision_list_by_object_id(
+    min_excl bigint,
+    max_incl bigint
+)
+    returns setof revision_entry
+    language sql
+    stable
+as $$
+    with revs_in_range as (
+        select *
+        from revision
+        where object_id > min_excl
+            and object_id <= max_incl
+    )
+    select
+        rev.id,
+        rev.date,
+        rev.date_offset,
+        rev.date_neg_utc_offset,
+        rev.date_offset_bytes,
+        rev.committer_date,
+        rev.committer_date_offset,
+        rev.committer_date_neg_utc_offset,
+        rev.committer_date_offset_bytes,
+        rev.type,
+        rev.directory,
+        rev.message,
+        author_person.id,
+        author_person.fullname,
+        author_person.name,
+        author_person.email,
+        committer_person.id,
+        committer_person.fullname,
+        committer_person.name,
+        committer_person.email,
+        rev.metadata,
+        rev.synthetic,
+        array(
+            select hist.parent_id::bytea
+            from revision_history as hist
+            where hist.id = rev.id
+            order by hist.parent_rank
+        ) as parents,
+        rev.object_id,
+        rev.extra_headers,
+        rev.raw_manifest
+    from revs_in_range as rev
+    left join person as author_person
+        on author_person.id = rev.author
+    left join person as committer_person
+        on committer_person.id = rev.committer
+    order by rev.object_id;
+$$;
+
+-- Insert every row of tmp_revision into revision.
+--
+-- swh_person_add_from_revision() runs first; author and committer ids are then
+-- looked up in person by fullname. There is no "not exists" filter here: all
+-- tmp_revision rows are inserted as-is.
+create or replace function swh_revision_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    perform swh_person_add_from_revision();
+
+    insert into revision (
+        id,
+        date,
+        date_offset,
+        date_neg_utc_offset,
+        date_offset_bytes,
+        committer_date,
+        committer_date_offset,
+        committer_date_neg_utc_offset,
+        committer_date_offset_bytes,
+        type,
+        directory,
+        message,
+        author,
+        committer,
+        metadata,
+        synthetic,
+        extra_headers,
+        raw_manifest
+    )
+    select
+        incoming.id,
+        incoming.date,
+        incoming.date_offset,
+        incoming.date_neg_utc_offset,
+        incoming.date_offset_bytes,
+        incoming.committer_date,
+        incoming.committer_date_offset,
+        incoming.committer_date_neg_utc_offset,
+        incoming.committer_date_offset_bytes,
+        incoming.type,
+        incoming.directory,
+        incoming.message,
+        author_person.id,
+        committer_person.id,
+        incoming.metadata,
+        incoming.synthetic,
+        incoming.extra_headers,
+        incoming.raw_manifest
+    from tmp_revision as incoming
+    left join person as author_person
+        on author_person.fullname = incoming.author_fullname
+    left join person as committer_person
+        on committer_person.fullname = incoming.committer_fullname;
+    return;
+end
+$$;
+
+-- Insert into release the distinct rows of tmp_release whose id is not already
+-- present.
+--
+-- swh_person_add_from_release() runs first; the author id is then looked up in
+-- person by fullname.
+create or replace function swh_release_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    perform swh_person_add_from_release();
+
+    insert into release (
+        id,
+        target,
+        target_type,
+        date,
+        date_offset,
+        date_neg_utc_offset,
+        date_offset_bytes,
+        name,
+        comment,
+        author,
+        synthetic,
+        raw_manifest
+    )
+    select distinct
+        incoming.id,
+        incoming.target,
+        incoming.target_type,
+        incoming.date,
+        incoming.date_offset,
+        incoming.date_neg_utc_offset,
+        incoming.date_offset_bytes,
+        incoming.name,
+        incoming.comment,
+        author_person.id,
+        incoming.synthetic,
+        incoming.raw_manifest
+    from tmp_release as incoming
+    left join person as author_person
+        on author_person.fullname = incoming.author_fullname
+    where not exists (
+        select 1
+        from release
+        where incoming.id = release.id
+    );
+    return;
+end
+$$;
+
+-- step 2: upgrade python code to start writing to them
+
+-- data migrations in 180.sql
diff --git a/sql/upgrades/180.sql b/sql/upgrades/180.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/180.sql
@@ -0,0 +1,88 @@
+-- SWH DB schema upgrade
+-- from_version: 179
+-- to_version: 180
+-- description: add {,committer_}date_offset_bytes to rev/rel + raw_manifest to dir/rev/rel, part 2
+
+-- record the new schema version
+insert into dbversion(version, release, description)
+ values(180, now(), 'Work In Progress');
+
+-- copied from 60-indexes.sql
+-- \gset stores the result in the psql variable :dbflavor_does_deduplication,
+-- which gates the constraint creation in step 4
+select swh_get_dbflavor() != 'read_replica' as dbflavor_does_deduplication \gset
+
+-- step 3: fill the offsets
+
+-- Render a UTC offset given in minutes as bytes: a sign, the hours on at least
+-- two digits, then the minutes on two digits (e.g. 120 -> '+0200').
+--
+-- offset_:        offset from UTC, in minutes; may be negative
+-- neg_utc_offset: when true, forces a '-' sign (this is how '-0000' is produced)
+--
+-- Returns NULL when offset_ is NULL.
+create or replace function _format_offset(offset_ smallint, neg_utc_offset bool)
+    returns bytea
+    language plpgsql
+as $$
+    declare
+        -- widen before taking the absolute value: abs() on the smallint -32768
+        -- fails with "smallint out of range", which would abort the backfill
+        abs_minutes integer := abs(offset_::integer);
+    begin
+        return convert_to(
+            -- sign
+            case when offset_ < 0 or neg_utc_offset then '-' else '+' end
+            -- hours (unfortunately we can't use lpad because it truncates)
+            || case when abs_minutes >= 600 then
+                    cast(abs_minutes / 60 as text)
+                else
+                    '0' || cast(abs_minutes / 60 as text)
+                end
+            -- minutes
+            || lpad(cast(mod(abs_minutes, 60) as text), 2, '0'),
+            'utf8'
+        );
+    end
+$$;
+
+-- make sure it's correct
+-- sanity-check _format_offset before it is used to rewrite existing rows;
+-- a failed assert aborts the migration
+do $$
+begin
+    -- NULL in, NULL out
+    assert _format_offset(null::smallint, null::bool) is null;
+    -- zero offset, with and without the "negative UTC" flag
+    assert _format_offset(0::smallint, false) = '+0000'::bytea;
+    assert _format_offset(0::smallint, true) = '-0000'::bytea;
+    -- minutes only
+    assert _format_offset(1::smallint, false) = '+0001'::bytea;
+    assert _format_offset((-1)::smallint, false) = '-0001'::bytea;
+    -- whole hours
+    assert _format_offset(120::smallint, false) = '+0200'::bytea;
+    assert _format_offset((-120)::smallint, false) = '-0200'::bytea;
+    -- three-digit hours must not be truncated
+    assert _format_offset(6000::smallint, false) = '+10000'::bytea;
+    assert _format_offset((-6000)::smallint, false) = '-10000'::bytea;
+end
+$$;
+
+-- backfill releases that have a date but no offset bytes yet
+update release
+    set date_offset_bytes = _format_offset(date_offset, date_neg_utc_offset)
+    where date_offset_bytes is null
+        and date is not null;
+
+-- Backfill the two revision offsets independently. A column is computed only
+-- when its own date is set and it has not been written yet; otherwise it keeps
+-- its current value. A row that needs only one of the two offsets therefore
+-- does not get the other one overwritten, and no offset is written for a NULL
+-- date.
+update revision
+    set date_offset_bytes = case
+            when date is not null and date_offset_bytes is null
+                then _format_offset(date_offset, date_neg_utc_offset)
+            else date_offset_bytes
+        end,
+        committer_date_offset_bytes = case
+            when committer_date is not null and committer_date_offset_bytes is null
+                then _format_offset(committer_date_offset, committer_date_neg_utc_offset)
+            else committer_date_offset_bytes
+        end
+    where (date is not null and date_offset_bytes is null)
+        or (committer_date is not null and committer_date_offset_bytes is null);
+
+
+-- step 4: add integrity constraints
+
+-- skipped when swh_get_dbflavor() is 'read_replica' (see the \gset near the top
+-- of this file)
+\if :dbflavor_does_deduplication
+
+ -- add new constraint on release dates
+ -- (created as "not valid", then validated by a separate statement)
+ alter table release
+ add constraint release_date_offset_not_null
+ check (date is null or date_offset_bytes is not null)
+ not valid;
+ alter table release
+ validate constraint release_date_offset_not_null;
+
+ -- same rule for the revision author date...
+ alter table revision
+ add constraint revision_date_offset_not_null
+ check (date is null or date_offset_bytes is not null)
+ not valid;
+
+ -- ...and for the revision committer date
+ alter table revision
+ add constraint revision_committer_date_offset_not_null
+ check (committer_date is null or committer_date_offset_bytes is not null)
+ not valid;
+
+ -- validate both revision constraints against the rows backfilled in step 3
+ alter table revision
+ validate constraint revision_date_offset_not_null;
+ alter table revision
+ validate constraint revision_committer_date_offset_not_null;
+\endif
+
+-- step 5: remove the old columns (date_offset and date_neg_utc_offset): in a future migration...
+
diff --git a/swh/storage/backfill.py b/swh/storage/backfill.py
--- a/swh/storage/backfill.py
+++ b/swh/storage/backfill.py
@@ -79,7 +79,7 @@
"status",
"reason",
],
- "directory": ["id", "dir_entries", "file_entries", "rev_entries"],
+ "directory": ["id", "dir_entries", "file_entries", "rev_entries", "raw_manifest"],
"extid": ["extid_type", "extid", "extid_version", "target_type", "target"],
"metadata_authority": ["type", "url"],
"metadata_fetcher": ["name", "version"],
@@ -117,9 +117,11 @@
"date",
"date_offset",
"date_neg_utc_offset",
+ "date_offset_bytes",
"committer_date",
"committer_date_offset",
"committer_date_neg_utc_offset",
+ "committer_date_offset_bytes",
"type",
"directory",
"message",
@@ -131,6 +133,7 @@
"where rh.id = revision.id order by rh.parent_rank asc)",
"parents",
),
+ "raw_manifest",
("a.id", "author_id"),
("a.name", "author_name"),
("a.email", "author_email"),
@@ -145,6 +148,7 @@
"date",
"date_offset",
"date_neg_utc_offset",
+ "date_offset_bytes",
"comment",
("release.name", "name"),
"synthetic",
@@ -154,6 +158,7 @@
("a.name", "author_name"),
("a.email", "author_email"),
("a.fullname", "author_fullname"),
+ "raw_manifest",
],
"snapshot": ["id", "object_id"],
}
diff --git a/swh/storage/cassandra/model.py b/swh/storage/cassandra/model.py
--- a/swh/storage/cassandra/model.py
+++ b/swh/storage/cassandra/model.py
@@ -114,6 +114,7 @@
PARTITION_KEY = ("id",)
id: bytes
+ raw_manifest: Optional[bytes]
@dataclasses.dataclass
@@ -145,6 +146,7 @@
synthetic: bool
metadata: str
extra_headers: dict
+ raw_manifest: Optional[bytes]
@dataclasses.dataclass
@@ -171,6 +173,7 @@
message: bytes
author: Person
synthetic: bool
+ raw_manifest: Optional[bytes]
@dataclasses.dataclass
diff --git a/swh/storage/cassandra/schema.py b/swh/storage/cassandra/schema.py
--- a/swh/storage/cassandra/schema.py
+++ b/swh/storage/cassandra/schema.py
@@ -56,13 +56,14 @@
"""
CREATE TYPE IF NOT EXISTS microtimestamp (
seconds bigint,
- microseconds int
+ microseconds int,
);""",
"""
CREATE TYPE IF NOT EXISTS microtimestamp_with_timezone (
timestamp frozen<microtimestamp>,
offset smallint,
- negative_utc boolean
+ negative_utc boolean,
+ offset_bytes blob,
);""",
"""
CREATE TYPE IF NOT EXISTS person (
@@ -110,8 +111,10 @@
-- true iff revision has been created by Software Heritage
metadata text,
-- extra metadata as JSON(tarball checksums, etc...)
- extra_headers frozen<list <list<blob>> >
+ extra_headers frozen<list <list<blob>> >,
-- extra commit information as (tuple(key, value), ...)
+ raw_manifest blob,
+ -- NULL if the object can be rebuild from other cells and revision_parent.
);""",
"""
CREATE TABLE IF NOT EXISTS revision_parent (
@@ -133,10 +136,14 @@
author person,
synthetic boolean,
-- true iff release has been created by Software Heritage
+ raw_manifest blob,
+ -- NULL if the object can be rebuild from other cells
);""",
"""
CREATE TABLE IF NOT EXISTS directory (
id blob PRIMARY KEY,
+ raw_manifest blob
+ -- NULL if the object can be rebuild from (sorted) entries
);""",
"""
CREATE TABLE IF NOT EXISTS directory_entry (
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -504,7 +504,9 @@
# Add the directory *after* adding all the entries, so someone
# calling snapshot_get_branch in the meantime won't end up
# with half the entries.
- self._cql_runner.directory_add_one(DirectoryRow(id=directory.id))
+ self._cql_runner.directory_add_one(
+ DirectoryRow(id=directory.id, raw_manifest=directory.raw_manifest)
+ )
return {"directory:add": len(directories)}
diff --git a/swh/storage/postgresql/converters.py b/swh/storage/postgresql/converters.py
--- a/swh/storage/postgresql/converters.py
+++ b/swh/storage/postgresql/converters.py
@@ -39,6 +39,7 @@
"timestamp": None,
"offset": 0,
"neg_utc_offset": None,
+ "offset_bytes": None,
}
@@ -84,7 +85,10 @@
def db_to_date(
- date: Optional[datetime.datetime], offset: int, neg_utc_offset: Optional[bool]
+ date: Optional[datetime.datetime],
+ offset: int,
+ neg_utc_offset: Optional[bool],
+ offset_bytes: Optional[bytes],
) -> Optional[TimestampWithTimezone]:
"""Convert the DB representation of a date to a swh-model compatible date.
@@ -105,6 +109,11 @@
# For older versions of the database that were not migrated to schema v160
neg_utc_offset = False
+ kwargs = {}
+ if offset_bytes:
+ # TODO: remove the conditional after migration is complete.
+ kwargs["offset_bytes"] = offset_bytes
+
return TimestampWithTimezone(
timestamp=Timestamp(
# we use floor() instead of int() to round down, because of negative dates
@@ -113,6 +122,7 @@
),
offset=offset,
negative_utc=neg_utc_offset,
+ **kwargs,
)
@@ -123,12 +133,14 @@
ts_with_tz: a TimestampWithTimezone object
Returns:
- dict: a dictionary with three keys:
+ dict: a dictionary with these keys:
- timestamp: a date in ISO format
- offset: the UTC offset in minutes
- neg_utc_offset: a boolean indicating whether a null offset is
negative or positive.
+ - offset_bytes: a byte representation of the latter two, usually as "+HHMM"
+ or "-HHMM"
"""
@@ -145,6 +157,7 @@
"timestamp": timestamp.isoformat(),
"offset": ts_with_tz.offset,
"neg_utc_offset": ts_with_tz.negative_utc,
+ "offset_bytes": ts_with_tz.offset_bytes,
}
@@ -165,18 +178,21 @@
"date": date["timestamp"],
"date_offset": date["offset"],
"date_neg_utc_offset": date["neg_utc_offset"],
+ "date_offset_bytes": date["offset_bytes"],
"committer_fullname": committer["fullname"],
"committer_name": committer["name"],
"committer_email": committer["email"],
"committer_date": committer_date["timestamp"],
"committer_date_offset": committer_date["offset"],
"committer_date_neg_utc_offset": committer_date["neg_utc_offset"],
+ "committer_date_offset_bytes": committer_date["offset_bytes"],
"type": revision.type.value,
"directory": revision.directory,
"message": revision.message,
"metadata": None if revision.metadata is None else dict(revision.metadata),
"synthetic": revision.synthetic,
"extra_headers": revision.extra_headers,
+ "raw_manifest": revision.raw_manifest,
"parents": [
{"id": revision.id, "parent_id": parent, "parent_rank": i,}
for i, parent in enumerate(revision.parents)
@@ -202,6 +218,7 @@
db_revision["date"],
db_revision["date_offset"],
db_revision["date_neg_utc_offset"],
+ db_revision["date_offset_bytes"],
)
committer = db_to_author(
@@ -213,6 +230,7 @@
db_revision["committer_date"],
db_revision["committer_date_offset"],
db_revision["committer_date_neg_utc_offset"],
+ db_revision["committer_date_offset_bytes"],
)
assert author, "author is None"
@@ -246,6 +264,7 @@
synthetic=db_revision["synthetic"],
extra_headers=extra_headers,
parents=tuple(parents),
+ raw_manifest=db_revision["raw_manifest"],
)
@@ -263,11 +282,13 @@
"date": date["timestamp"],
"date_offset": date["offset"],
"date_neg_utc_offset": date["neg_utc_offset"],
+ "date_offset_bytes": date["offset_bytes"],
"name": release.name,
"target": release.target,
"target_type": release.target_type.value,
"comment": release.message,
"synthetic": release.synthetic,
+ "raw_manifest": release.raw_manifest,
}
@@ -285,7 +306,10 @@
db_release["author_email"],
)
date = db_to_date(
- db_release["date"], db_release["date_offset"], db_release["date_neg_utc_offset"]
+ db_release["date"],
+ db_release["date_offset"],
+ db_release["date_neg_utc_offset"],
+ db_release["date_offset_bytes"],
)
return Release(
@@ -297,6 +321,7 @@
synthetic=db_release["synthetic"],
target=db_release["target"],
target_type=ObjectType(db_release["target_type"]),
+ raw_manifest=db_release["raw_manifest"],
)
diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py
--- a/swh/storage/postgresql/db.py
+++ b/swh/storage/postgresql/db.py
@@ -30,7 +30,7 @@
"""
- current_version = 178
+ current_version = 180
def mktemp_dir_entry(self, entry_type, cur=None):
self._cursor(cur).execute(
@@ -437,9 +437,11 @@
"date",
"date_offset",
"date_neg_utc_offset",
+ "date_offset_bytes",
"committer_date",
"committer_date_offset",
"committer_date_neg_utc_offset",
+ "committer_date_offset_bytes",
"type",
"directory",
"message",
@@ -452,6 +454,7 @@
"metadata",
"synthetic",
"extra_headers",
+ "raw_manifest",
]
revision_get_cols = revision_add_cols + ["parents"]
@@ -1200,9 +1203,11 @@
"date",
"date_offset",
"date_neg_utc_offset",
+ "date_offset_bytes",
"name",
"comment",
"synthetic",
+ "raw_manifest",
"author_fullname",
"author_name",
"author_email",
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -524,7 +524,7 @@
# Copy directory ids
dirs_missing_dict = ({"id": dir} for dir in dirs_missing)
db.mktemp("directory", cur)
- db.copy_to(dirs_missing_dict, "tmp_directory", ["id"], cur)
+ db.copy_to(dirs_missing_dict, "tmp_directory", ["id", "raw_manifest"], cur)
# Copy entries
for entry_type, entry_list in dir_entries.items():
diff --git a/swh/storage/sql/30-schema.sql b/swh/storage/sql/30-schema.sql
--- a/swh/storage/sql/30-schema.sql
+++ b/swh/storage/sql/30-schema.sql
@@ -17,7 +17,7 @@
-- latest schema version
insert into dbversion(version, release, description)
- values(178, now(), 'Work In Progress');
+ values(180, now(), 'Work In Progress');
-- a SHA1 checksum
create domain sha1 as bytea check (length(value) = 20);
@@ -137,7 +137,8 @@
dir_entries bigint[], -- sub-directories, reference directory_entry_dir
file_entries bigint[], -- contained files, reference directory_entry_file
rev_entries bigint[], -- mounted revisions, reference directory_entry_rev
- object_id bigserial -- short object identifier
+ object_id bigserial, -- short object identifier
+ raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields
);
comment on table directory is 'Contents of a directory, synonymous to tree (git)';
@@ -146,6 +147,7 @@
comment on column directory.file_entries is 'Contained files, reference directory_entry_file';
comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev';
comment on column directory.object_id is 'Short object identifier';
+comment on column directory.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
-- A directory entry pointing to a (sub-)directory.
@@ -240,7 +242,10 @@
object_id bigserial,
date_neg_utc_offset boolean,
committer_date_neg_utc_offset boolean,
- extra_headers bytea[][] not null -- extra headers (used in hash computation)
+ extra_headers bytea[][] not null, -- extra headers (used in hash computation)
+ date_offset_bytes bytea,
+ committer_date_offset_bytes bytea,
+ raw_manifest bytea -- git manifest of the object, if it cannot be represented using only the other fields
);
comment on table revision is 'A revision represents the state of a source code tree at a specific point in time';
@@ -260,6 +265,9 @@
comment on column revision.metadata is 'Extra revision metadata';
comment on column revision.object_id is 'Non-intrinsic, sequential object identifier';
comment on column revision.extra_headers is 'Extra revision headers; used in revision hash computation';
+comment on column revision.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column revision.committer_date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column revision.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
-- either this table or the sha1_git[] column on the revision table
@@ -378,7 +386,9 @@
synthetic boolean not null default false, -- true iff release has been created by Software Heritage
object_id bigserial,
target_type object_type not null,
- date_neg_utc_offset boolean
+ date_neg_utc_offset boolean,
+ date_offset_bytes bytea,
+ raw_manifest bytea
);
comment on table release is 'Details of a software release, synonymous with
@@ -395,6 +405,8 @@
comment on column release.target_type is 'Object type (''content'', ''directory'', ''revision'',
''release'', ''snapshot'')';
comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp';
+comment on column release.date_offset_bytes is 'Raw git representation of the timezone, as an offset from UTC. It should follow this format: ``+HHMM`` or ``-HHMM``';
+comment on column release.raw_manifest is 'git manifest of the object, if it cannot be represented using only the other fields';
-- Tools
create table metadata_fetcher
diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql
--- a/swh/storage/sql/40-funcs.sql
+++ b/swh/storage/sql/40-funcs.sql
@@ -264,8 +264,8 @@
perform swh_directory_entry_add('dir');
perform swh_directory_entry_add('rev');
- insert into directory
- select * from tmp_directory t
+ insert into directory (id, dir_entries, file_entries, rev_entries, raw_manifest)
+ select id, dir_entries, file_entries, rev_entries, raw_manifest from tmp_directory t
where not exists (
select 1 from directory d
where d.id = t.id);
@@ -476,9 +476,11 @@
date timestamptz,
date_offset smallint,
date_neg_utc_offset boolean,
+ date_offset_bytes bytea,
committer_date timestamptz,
committer_date_offset smallint,
committer_date_neg_utc_offset boolean,
+ committer_date_offset_bytes bytea,
type revision_type,
directory sha1_git,
message bytea,
@@ -494,7 +496,8 @@
synthetic boolean,
parents bytea[],
object_id bigint,
- extra_headers bytea[][]
+ extra_headers bytea[][],
+ raw_manifest bytea
);
@@ -505,12 +508,13 @@
language sql
stable
as $$
- select t.id, r.date, r.date_offset, r.date_neg_utc_offset,
- r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ select t.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes,
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email,
c.id, c.fullname, c.name, c.email,
- r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers
+ r.metadata, r.synthetic, t.parents, r.object_id, r.extra_headers,
+ r.raw_manifest
from swh_revision_list(root_revisions, num_revs) as t
left join revision r on t.id = r.id
left join person a on a.id = r.author
@@ -567,8 +571,8 @@
begin
perform swh_person_add_from_revision();
- insert into revision (id, date, date_offset, date_neg_utc_offset, committer_date, committer_date_offset, committer_date_neg_utc_offset, type, directory, message, author, committer, metadata, synthetic, extra_headers)
- select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers
+ insert into revision (id, date, date_offset, date_neg_utc_offset, date_offset_bytes, committer_date, committer_date_offset, committer_date_neg_utc_offset, committer_date_offset_bytes, type, directory, message, author, committer, metadata, synthetic, extra_headers, raw_manifest)
+ select t.id, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.committer_date, t.committer_date_offset, t.committer_date_neg_utc_offset, t.committer_date_offset_bytes, t.type, t.directory, t.message, a.id, c.id, t.metadata, t.synthetic, t.extra_headers, t.raw_manifest
from tmp_revision t
left join person a on a.fullname = t.author_fullname
left join person c on c.fullname = t.committer_fullname;
@@ -623,8 +627,8 @@
begin
perform swh_person_add_from_release();
- insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
- select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
+ insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, date_offset_bytes, name, comment, author, synthetic, raw_manifest)
+ select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.date_offset_bytes, t.name, t.comment, a.id, t.synthetic, t.raw_manifest
from tmp_release t
left join person a on a.fullname = t.author_fullname
where not exists (select 1 from release where t.id = release.id);
@@ -850,12 +854,12 @@
select * from revision
where object_id > min_excl and object_id <= max_incl
)
- select r.id, r.date, r.date_offset, r.date_neg_utc_offset,
- r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset,
+ select r.id, r.date, r.date_offset, r.date_neg_utc_offset, r.date_offset_bytes,
+ r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, r.committer_date_offset_bytes,
r.type, r.directory, r.message,
a.id, a.fullname, a.name, a.email, c.id, c.fullname, c.name, c.email, r.metadata, r.synthetic,
array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id order by rh.parent_rank)
- as parents, r.object_id, r.extra_headers
+ as parents, r.object_id, r.extra_headers, r.raw_manifest
from revs r
left join person a on a.id = r.author
left join person c on c.id = r.committer
diff --git a/swh/storage/sql/60-indexes.sql b/swh/storage/sql/60-indexes.sql
--- a/swh/storage/sql/60-indexes.sql
+++ b/swh/storage/sql/60-indexes.sql
@@ -124,15 +124,30 @@
add constraint revision_date_neg_utc_offset_not_null
check (date is null or date_neg_utc_offset is not null)
not valid;
+
alter table revision
add constraint revision_committer_date_neg_utc_offset_not_null
check (committer_date is null or committer_date_neg_utc_offset is not null)
not valid;
+ alter table revision
+ add constraint revision_date_offset_not_null
+ check (date is null or date_offset_bytes is not null)
+ not valid;
+
+ alter table revision
+ add constraint revision_committer_date_offset_not_null
+ check (committer_date is null or committer_date_offset_bytes is not null)
+ not valid;
+
alter table revision
validate constraint revision_date_neg_utc_offset_not_null;
alter table revision
validate constraint revision_committer_date_neg_utc_offset_not_null;
+ alter table revision
+ validate constraint revision_date_offset_not_null;
+ alter table revision
+ validate constraint revision_committer_date_offset_not_null;
\endif
\if :dbflavor_default
@@ -234,8 +249,15 @@
check (date is null or date_neg_utc_offset is not null)
not valid;
+ alter table release
+ add constraint release_date_offset_not_null
+ check (date is null or date_offset_bytes is not null)
+ not valid;
+
alter table release
validate constraint release_date_neg_utc_offset_not_null;
+ alter table release
+ validate constraint release_date_offset_not_null;
-- if the author is null, then the date must be null
alter table release add constraint release_author_date_check check ((date is null) or (author is not null)) not valid;
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -31,6 +31,7 @@
Person,
RawExtrinsicMetadata,
Revision,
+ RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
@@ -721,6 +722,36 @@
swh_storage.refresh_stat_counters()
assert swh_storage.stat_counters()["directory"] == 1
+ def test_directory_add_with_raw_manifest(self, swh_storage, sample_data):
+ """Add a directory that carries a ``raw_manifest`` and check it is recorded.
+
+ Checks the missing-list before and after the add, the add counter, the
+ journal, and the listed entries. The stored manifest itself is not
+ checked yet (see the TODO at the end).
+ """
+ content = sample_data.content
+ directory = sample_data.directory
+ # set the manifest first, then recompute the id from the evolved object
+ directory = attr.evolve(directory, raw_manifest=b"foo")
+ directory = attr.evolve(directory, id=directory.compute_hash())
+
+ assert directory.entries[0].target == content.sha1_git
+ swh_storage.content_add([content])
+
+ # the directory must be unknown before the add
+ init_missing = list(swh_storage.directory_missing([directory.id]))
+ assert [directory.id] == init_missing
+
+ actual_result = swh_storage.directory_add([directory])
+ assert actual_result == {"directory:add": 1}
+
+ assert ("directory", directory) in list(
+ swh_storage.journal_writer.journal.objects
+ )
+
+ actual_data = list(swh_storage.directory_ls(directory.id))
+ expected_data = list(transform_entries(swh_storage, directory))
+
+ # one-way check: every listed entry must be an expected one
+ for data in actual_data:
+ assert data in expected_data
+
+ after_missing = list(swh_storage.directory_missing([directory.id]))
+ assert after_missing == []
+
+ # TODO: check the recorded manifest
+
@settings(
suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
+ function_scoped_fixture_check,
@@ -732,9 +763,16 @@
swh_storage.directory_add(directories)
for directory in directories:
- assert swh_storage.directory_get_entries(directory.id) == PagedResult(
- results=list(directory.entries), next_page_token=None,
- )
+ if directory.raw_manifest is None:
+ assert swh_storage.directory_get_entries(directory.id) == PagedResult(
+ results=list(directory.entries), next_page_token=None,
+ )
+ else:
+ # TODO: compare the manifests are the same (currently, we can't
+ # because there is no way to get the raw_manifest of a directory)
+ # we can't compare the other fields, because they become non-intrinsic,
+ # so they may clash between hypothesis runs
+ pass
def test_directory_add_twice(self, swh_storage, sample_data):
directory = sample_data.directories[1]
@@ -1069,6 +1107,25 @@
assert swh_storage.revision_get([revision.id])[0] == revision
+ def test_revision_add_with_raw_manifest(self, swh_storage, sample_data):
+ """Add a revision that carries a ``raw_manifest`` and read it back.
+
+ Checks the missing-list before and after the add, the add counter, the
+ journal, and that ``revision_get`` returns an object equal to the one
+ that was added.
+ """
+ revision = sample_data.revision
+ # set the manifest first, then recompute the id from the evolved object
+ revision = attr.evolve(revision, raw_manifest=b"foo")
+ revision = attr.evolve(revision, id=revision.compute_hash())
+ init_missing = swh_storage.revision_missing([revision.id])
+ assert list(init_missing) == [revision.id]
+
+ actual_result = swh_storage.revision_add([revision])
+ assert actual_result == {"revision:add": 1}
+
+ end_missing = swh_storage.revision_missing([revision.id])
+ assert list(end_missing) == []
+
+ # the journal must contain this revision and nothing else
+ assert list(swh_storage.journal_writer.journal.objects) == [
+ ("revision", revision)
+ ]
+
+ assert swh_storage.revision_get([revision.id]) == [revision]
+
@settings(
suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
+ function_scoped_fixture_check,
@@ -1086,14 +1143,21 @@
metadata=None,
committer=attr.evolve(revision.committer, name=None, email=None),
author=attr.evolve(revision.author, name=None, email=None),
+ type=RevisionType.GIT,
)
for revision in revisions
]
swh_storage.revision_add(revisions)
- revs = swh_storage.revision_get([revision.id for revision in revisions])
- assert set(revs) == set(revisions)
+ for revision in revisions:
+ (rev,) = swh_storage.revision_get([revision.id])
+ if rev.raw_manifest is None:
+ assert rev == revision
+ else:
+ assert rev.raw_manifest == revision.raw_manifest
+ # we can't compare the other fields, because they become non-intrinsic,
+ # so they may clash between hypothesis runs
def test_revision_add_name_clash(self, swh_storage, sample_data):
revision, revision2 = sample_data.revisions[:2]
@@ -1522,6 +1586,26 @@
swh_storage.refresh_stat_counters()
assert swh_storage.stat_counters()["release"] == 2
+ def test_release_add_with_raw_manifest(self, swh_storage, sample_data):
+ release = sample_data.releases[0]
+ release = attr.evolve(release, raw_manifest=b"foo")
+ release = attr.evolve(release, id=release.compute_hash())
+
+ init_missing = swh_storage.release_missing([release.id])
+ assert list(init_missing) == [release.id]
+
+ actual_result = swh_storage.release_add([release])
+ assert actual_result == {"release:add": 1}
+
+ end_missing = swh_storage.release_missing([release.id])
+ assert list(end_missing) == []
+
+ assert list(swh_storage.journal_writer.journal.objects) == [
+ ("release", release),
+ ]
+
+ assert swh_storage.release_get([release.id]) == [release]
+
@settings(
suppress_health_check=[HealthCheck.too_slow, HealthCheck.data_too_large]
+ function_scoped_fixture_check,
@@ -1543,9 +1627,14 @@
]
swh_storage.release_add(releases)
- assert set(
- swh_storage.release_get([release.id for release in releases])
- ) == set(releases)
+ for release in releases:
+ (rev,) = swh_storage.release_get([release.id])
+ if rev.raw_manifest is None:
+ assert rev == release
+ else:
+ assert rev.raw_manifest == release.raw_manifest
+ # we can't compare the other fields, because they become non-intrinsic,
+ # so they may clash between hypothesis runs
def test_release_add_no_author_date(self, swh_storage, sample_data):
full_release = sample_data.release
diff --git a/swh/storage/tests/test_backfill.py b/swh/storage/tests/test_backfill.py
--- a/swh/storage/tests/test_backfill.py
+++ b/swh/storage/tests/test_backfill.py
@@ -151,6 +151,7 @@
"date",
"date_offset",
"date_neg_utc_offset",
+ "date_offset_bytes",
"comment",
"name",
"synthetic",
@@ -160,12 +161,13 @@
"author_name",
"author_email",
"author_fullname",
+ "raw_manifest",
]
assert (
query
== """
-select release.id as id,date,date_offset,date_neg_utc_offset,comment,release.name as name,synthetic,target,target_type,a.id as author_id,a.name as author_name,a.email as author_email,a.fullname as author_fullname
+select release.id as id,date,date_offset,date_neg_utc_offset,date_offset_bytes,comment,release.name as name,synthetic,target,target_type,a.id as author_id,a.name as author_name,a.email as author_email,a.fullname as author_fullname,raw_manifest
from release
left join person a on release.author=a.id
where (release.id) >= %s and (release.id) < %s
diff --git a/swh/storage/tests/test_postgresql_converters.py b/swh/storage/tests/test_postgresql_converters.py
--- a/swh/storage/tests/test_postgresql_converters.py
+++ b/swh/storage/tests/test_postgresql_converters.py
@@ -23,17 +23,27 @@
@pytest.mark.parametrize(
"model_date,db_date",
[
- (None, {"timestamp": None, "offset": 0, "neg_utc_offset": None}),
+ (
+ None,
+ {
+ "timestamp": None,
+ "offset": 0,
+ "neg_utc_offset": None,
+ "offset_bytes": None,
+ },
+ ),
(
TimestampWithTimezone(
timestamp=Timestamp(seconds=1234567890, microseconds=0,),
offset=120,
negative_utc=False,
+ offset_bytes=b"+0200",
),
{
"timestamp": "2009-02-13T23:31:30+00:00",
"offset": 120,
"neg_utc_offset": False,
+ "offset_bytes": b"+0200",
},
),
(
@@ -41,11 +51,13 @@
timestamp=Timestamp(seconds=1123456789, microseconds=0,),
offset=0,
negative_utc=True,
+ offset_bytes=b"-0000",
),
{
"timestamp": "2005-08-07T23:19:49+00:00",
"offset": 0,
"neg_utc_offset": True,
+ "offset_bytes": b"-0000",
},
),
(
@@ -53,11 +65,13 @@
timestamp=Timestamp(seconds=1234567890, microseconds=0,),
offset=42,
negative_utc=False,
+ offset_bytes=b"+0042",
),
{
"timestamp": "2009-02-13T23:31:30+00:00",
"offset": 42,
"neg_utc_offset": False,
+ "offset_bytes": b"+0042",
},
),
(
@@ -65,11 +79,13 @@
timestamp=Timestamp(seconds=1634366813, microseconds=0,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "2021-10-16T06:46:53+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -77,11 +93,13 @@
timestamp=Timestamp(seconds=0, microseconds=0,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1970-01-01T00:00:00+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -89,11 +107,13 @@
timestamp=Timestamp(seconds=0, microseconds=1,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1970-01-01T00:00:00.000001+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -101,11 +121,13 @@
timestamp=Timestamp(seconds=-1, microseconds=0,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1969-12-31T23:59:59+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -113,11 +135,13 @@
timestamp=Timestamp(seconds=-1, microseconds=1,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1969-12-31T23:59:59.000001+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -125,11 +149,13 @@
timestamp=Timestamp(seconds=-3600, microseconds=0,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1969-12-31T23:00:00+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
},
),
(
@@ -137,11 +163,27 @@
timestamp=Timestamp(seconds=-3600, microseconds=1,),
offset=-120,
negative_utc=False,
+ offset_bytes=b"-0200",
),
{
"timestamp": "1969-12-31T23:00:00.000001+00:00",
"offset": -120,
"neg_utc_offset": False,
+ "offset_bytes": b"-0200",
+ },
+ ),
+ (
+ TimestampWithTimezone(
+ timestamp=Timestamp(seconds=1234567890, microseconds=0,),
+ offset=120,
+ negative_utc=False,
+ offset_bytes=b"+200",
+ ),
+ {
+ "timestamp": "2009-02-13T23:31:30+00:00",
+ "offset": 120,
+ "neg_utc_offset": False,
+ "offset_bytes": b"+200",
},
),
],
@@ -155,6 +197,7 @@
else datetime.datetime.fromisoformat(db_date["timestamp"]),
offset=db_date["offset"],
neg_utc_offset=db_date["neg_utc_offset"],
+ offset_bytes=db_date["offset_bytes"],
)
== model_date
)
@@ -184,9 +227,11 @@
"date": None,
"date_offset": None,
"date_neg_utc_offset": None,
+ "date_offset_bytes": None,
"committer_date": None,
"committer_date_offset": None,
"committer_date_neg_utc_offset": None,
+ "committer_date_offset_bytes": None,
"type": "git",
"directory": b"dir-sha1",
"message": b"commit message",
@@ -199,6 +244,7 @@
"metadata": {},
"synthetic": False,
"extra_headers": (),
+ "raw_manifest": None,
"parents": [b"123", b"456"],
}
)
@@ -234,12 +280,14 @@
"date": None,
"date_offset": None,
"date_neg_utc_offset": None,
+ "date_offset_bytes": None,
"name": b"release-name",
"comment": b"release comment",
"synthetic": True,
"author_fullname": b"auth-fullname",
"author_name": b"auth-name",
"author_email": b"auth-email",
+ "raw_manifest": None,
}
)

File Metadata

Mime Type
text/plain
Expires
Tue, Jun 3, 7:21 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217174

Event Timeline