diff --git a/sql/upgrades/156.sql b/sql/upgrades/156.sql
new file mode 100644
--- /dev/null
+++ b/sql/upgrades/156.sql
@@ -0,0 +1,27 @@
+-- SWH DB schema upgrade
+-- from_version: 155
+-- to_version: 156
+-- description: Make swh_release_add properly idempotent
+
+-- latest schema version
+insert into dbversion(version, release, description)
+      values(156, now(), 'Work In Progress');
+
+-- Create entries in release from tmp_release
+create or replace function swh_release_add()
+    returns void
+    language plpgsql
+as $$
+begin
+    perform swh_person_add_from_release();
+
+    -- "distinct" collapses identical rows of tmp_release and "not exists"
+    -- skips ids already in release, so adding a release twice is a no-op.
+    insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
+      select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
+        from tmp_release t
+        left join person a on a.fullname = t.author_fullname
+        where not exists (select 1 from release where t.id = release.id);
+    return;
+end
+$$;
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -512,16 +512,25 @@
         return self._cql_runner.revision_get_random().id
 
     def release_add(self, releases: Iterable[Release]) -> Dict:
-        missing = self.release_missing([rel.id for rel in releases])
-        releases = [rel for rel in releases if rel.id in missing]
+        """Add the releases that are not stored yet, each at most once.
+
+        Returns a dict whose "release:add" entry is the number of new,
+        distinct releases.
+        """
+        # Deduplicate by id in O(n), keeping the first occurrence of each release.
+        unique: Dict = {}
+        for rel in releases:
+            unique.setdefault(rel.id, rel)
+        missing = set(self.release_missing(list(unique)))
+        to_add = [rel for rel in unique.values() if rel.id in missing]
 
-        self.journal_writer.release_add(releases)
+        self.journal_writer.release_add(to_add)
 
-        for release in releases:
+        for release in to_add:
             if release:
                 self._cql_runner.release_add_one(release_to_db(release))
 
-        return {"release:add": len(missing)}
+        return {"release:add": len(to_add)}
 
     def release_missing(self, releases):
         return self._cql_runner.release_missing(releases)
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -523,18 +523,26 @@
         return random.choice(list(self._revisions))
 
     def release_add(self, releases: Iterable[Release]) -> Dict:
-        releases = [rel for rel in releases if rel.id not in self._releases]
-        self.journal_writer.release_add(releases)
-
-        count = 0
+        """Add the releases that are not stored yet, each at most once.
+
+        Returns a dict whose "release:add" entry is the number of new,
+        distinct releases.
+        """
+        # Deduplicate by id in O(n), keeping the first occurrence of each release.
+        unique: Dict = {}
         for rel in releases:
+            if rel.id not in self._releases:
+                unique.setdefault(rel.id, rel)
+        to_add = list(unique.values())
+        self.journal_writer.release_add(to_add)
+
+        for rel in to_add:
             if rel.author:
                 self._person_add(rel.author)
             self._objects[rel.id].append(("release", rel.id))
             self._releases[rel.id] = rel
-            count += 1
 
-        return {"release:add": count}
+        return {"release:add": len(to_add)}
 
     def release_missing(self, releases):
         yield from (rel for rel in releases if rel not in self._releases)
diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql
--- a/swh/storage/sql/30-swh-schema.sql
+++ b/swh/storage/sql/30-swh-schema.sql
@@ -17,7 +17,7 @@
 
 -- latest schema version
 insert into dbversion(version, release, description)
-      values(155, now(), 'Work In Progress');
+      values(156, now(), 'Work In Progress');
 
 -- a SHA1 checksum
 create domain sha1 as bytea check (length(value) = 20);
diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql
--- a/swh/storage/sql/40-swh-func.sql
+++ b/swh/storage/sql/40-swh-func.sql
@@ -568,9 +568,12 @@
     perform swh_person_add_from_release();
 
+    -- "distinct" collapses identical rows of tmp_release and "not exists"
+    -- skips ids already in release, so adding a release twice is a no-op.
     insert into release (id, target, target_type, date, date_offset, date_neg_utc_offset, name, comment, author, synthetic)
-    select t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
-    from tmp_release t
-    left join person a on a.fullname = t.author_fullname;
+      select distinct t.id, t.target, t.target_type, t.date, t.date_offset, t.date_neg_utc_offset, t.name, t.comment, a.id, t.synthetic
+        from tmp_release t
+        left join person a on a.fullname = t.author_fullname
+        where not exists (select 1 from release where t.id = release.id);
     return;
 end
 $$;
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1227,13 +1227,15 @@
             ("release", Release.from_dict(data.release))
         ]
 
-        actual_result = swh_storage.release_add([data.release, data.release2])
+        actual_result = swh_storage.release_add(
+            [data.release, data.release2, data.release, data.release2]
+        )
         assert actual_result == {"release:add": 1}
 
-        assert list(swh_storage.journal_writer.journal.objects) == [
-            ("release", Release.from_dict(data.release)),
-            ("release", Release.from_dict(data.release2)),
-        ]
+        assert set(swh_storage.journal_writer.journal.objects) == {
+            ("release", Release.from_dict(data.release)),
+            ("release", Release.from_dict(data.release2)),
+        }
 
     def test_release_add_name_clash(self, swh_storage):
         release1 = data.release.copy()