begin; -- identify release and snapshots to mark them for cleanup create or replace function mark_wrong_release_and_snapshot_for_cleanup() returns void language plpgsql as $$ declare release_id release.id%type; snapshot_id snapshot.object_id%type; branch_id snapshot_branches.branch_id%type; begin -- data to clean up create table temp_to_cleanup_release( id sha1_git not null ); create table temp_to_cleanup_snapshot_branches( snapshot_id bigint not null, branch_id bigint not null ); for release_id, snapshot_id, branch_id in ( select distinct r.id, s.object_id, sbs.branch_id from origin o inner join origin_visit ov on (o.id=ov.origin and o.type='hg') inner join snapshot s on ov.snapshot_id=s.object_id inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id inner join snapshot_branch sb on sbs.branch_id=sb.object_id inner join release r on (sb.target_type='release' and r.id=sb.target) ) loop -- reference the release to clean up later insert into temp_to_cleanup_release(id) values (release_id); -- then referencing raw snapshot targetting release (not enough though) insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id) values (snapshot_id, branch_id); end loop; -- Note: if a snapshot is mixed with target of type release and -- other stuff (revision, etc...), it will not be identified by the -- prior loop. But this needs to as those snapshot identifiers will -- be dangling (indeed, the next ingestion will create other correct -- snapshots with other identifiers). -- To conclude, we need to iterate over the remaining snapshots as -- well to mark them for cleanup as well for snapshot_id, branch_id in ( select distinct s.object_id, sbs.branch_id from origin o inner join origin_visit ov on (o.id=ov.origin and o.type='hg') inner join snapshot s on ov.snapshot_id=s.object_id inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id inner join snapshot_branch sb on sbs.branch_id=sb.object_id ) loop insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id) values (snapshot_id, branch_id); -- this will possibly create -- duplicates here (not a -- problem) end loop; return; end; $$; -- clean up 'simple' origin data create or replace function mark_wrong_origin_visit_and_fetch_history_for_cleanup() returns void language plpgsql as $$ declare origin_id origin.id%type; begin create table if not exists temp_to_cleanup_origin_visit( origin bigint not null, visit bigint not null ); create table if not exists temp_to_cleanup_fetch_history( id bigint not null, origin bigint not null ); -- clean up those origins' metadata for origin_id in ( select id from origin where type='hg' ) loop insert into temp_to_cleanup_origin_visit(origin, visit) select origin, visit from origin_visit where origin=origin_id; insert into temp_to_cleanup_fetch_history(id, origin) select id, origin from fetch_history where origin=origin_id; end loop; return; end; $$; -- mark wrong data for cleanup, following tables will be populated: -- - temp_to_cleanup_release -- - temp_to_cleanup_snapshot_branches -- - temp_to_cleanup_origin_visit -- - temp_to_cleanup_fetch_history create or replace function mark_wrong_data_for_cleanup() returns void language plpgsql as $$ begin perform mark_wrong_release_and_snapshot_for_cleanup(); perform mark_wrong_origin_visit_and_fetch_history_for_cleanup(); end; $$; -- cleanup create or replace function clean_wrong_data_up() returns void language sql volatile as $$ delete from release where id in ( select distinct id from temp_to_cleanup_release ); delete from snapshot_branches sbs using temp_to_cleanup_snapshot_branches t where sbs.branch_id=t.branch_id and sbs.snapshot_id=t.snapshot_id; -- this could fail if snapshot is also referencing other releases delete from snapshot_branch where object_id in ( select distinct branch_id from temp_to_cleanup_snapshot_branches ); delete from origin_visit where snapshot_id in ( select distinct snapshot_id from temp_to_cleanup_snapshot_branches ); delete from snapshot where object_id in ( select distinct snapshot_id from temp_to_cleanup_snapshot_branches ); delete from origin_visit ov using temp_to_cleanup_origin_visit t where ov.origin=t.origin and ov.visit = t.origin; delete from fetch_history fh using temp_to_cleanup_fetch_history t where fh.id=t.id and fh.origin=t.origin; -- do not drop the tables just yet, have a look that the -- data have been deleted alright. $$; begin; -- effective cleanup marking starts here (create table, populate them) select * from mark_wrong_data_for_cleanup(); -- effective cleanup starts here (does not drop table too) select * from clean_wrong_data_up(); -- post-analysis after manual cleanup -- if all is good -- we can clean up drop function mark_wrong_data_for_cleanup(); drop function mark_wrong_origin_visit_and_fetch_history_for_cleanup(); drop function mark_wrong_release_and_snapshot_for_cleanup(); drop function clean_wrong_data_up(); -- drop the tables drop table temp_to_cleanup_release; drop table temp_to_cleanup_snapshot_branches; drop table temp_to_cleanup_origin_visit; drop table temp_to_cleanup_fetch_history; commit; --rollback;