Page MenuHomeSoftware Heritage
Paste P286

mercurial loading release issue - Cleanup release/snapshot/origin_visits procedure
ActivePublic

Authored by ardumont on Jul 25 2018, 6:23 PM.
begin;
-- identify release and snapshots to mark them for cleanup
create or replace function mark_wrong_release_and_snapshot_for_cleanup()
returns void
language plpgsql
as $$
declare
release_id release.id%type;
snapshot_id snapshot.object_id%type;
branch_id snapshot_branches.branch_id%type;
begin
-- data to clean up
create table temp_to_cleanup_release(
id sha1_git not null
);
create table temp_to_cleanup_snapshot_branches(
snapshot_id bigint not null, branch_id bigint not null
);
for release_id, snapshot_id, branch_id in (
select distinct r.id, s.object_id, sbs.branch_id
from origin o
inner join origin_visit ov on (o.id=ov.origin and o.type='hg')
inner join snapshot s on ov.snapshot_id=s.object_id
inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id
inner join snapshot_branch sb on sbs.branch_id=sb.object_id
inner join release r on (sb.target_type='release' and r.id=sb.target)
)
loop
-- reference the release to clean up later
insert into temp_to_cleanup_release(id)
values (release_id);
-- then referencing raw snapshot targetting release (not enough though)
insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id)
values (snapshot_id, branch_id);
end loop;
-- Note: if a snapshot is mixed with target of type release and
-- other stuff (revision, etc...), it will not be identified by the
-- prior loop. But this needs to as those snapshot identifiers will
-- be dangling (indeed, the next ingestion will create other correct
-- snapshots with other identifiers).
-- To conclude, we need to iterate over the remaining snapshots as
-- well to mark them for cleanup as well
for snapshot_id, branch_id in (
select distinct s.object_id, sbs.branch_id
from origin o
inner join origin_visit ov on (o.id=ov.origin and o.type='hg')
inner join snapshot s on ov.snapshot_id=s.object_id
inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id
inner join snapshot_branch sb on sbs.branch_id=sb.object_id
)
loop
insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id)
values (snapshot_id, branch_id); -- this will possibly create
-- duplicates here (not a
-- problem)
end loop;
return;
end;
$$;
-- clean up 'simple' origin data
create or replace function mark_wrong_origin_visit_and_fetch_history_for_cleanup()
returns void
language plpgsql
as $$
declare
origin_id origin.id%type;
begin
create table if not exists temp_to_cleanup_origin_visit(
origin bigint not null,
visit bigint not null
);
create table if not exists temp_to_cleanup_fetch_history(
id bigint not null,
origin bigint not null
);
-- clean up those origins' metadata
for origin_id in (
select id from origin where type='hg'
)
loop
insert into temp_to_cleanup_origin_visit(origin, visit)
select origin, visit from origin_visit where origin=origin_id;
insert into temp_to_cleanup_fetch_history(id, origin)
select id, origin from fetch_history where origin=origin_id;
end loop;
return;
end;
$$;
-- mark wrong data for cleanup, following tables will be populated:
-- - temp_to_cleanup_release
-- - temp_to_cleanup_snapshot_branches
-- - temp_to_cleanup_origin_visit
-- - temp_to_cleanup_fetch_history
create or replace function mark_wrong_data_for_cleanup()
returns void
language plpgsql
as $$
begin
perform mark_wrong_release_and_snapshot_for_cleanup();
perform mark_wrong_origin_visit_and_fetch_history_for_cleanup();
end;
$$;
-- cleanup
create or replace function clean_wrong_data_up()
returns void
language sql volatile
as $$
delete from release where id in (
select distinct id
from temp_to_cleanup_release
);
delete from snapshot_branches sbs
using temp_to_cleanup_snapshot_branches t
where sbs.branch_id=t.branch_id and sbs.snapshot_id=t.snapshot_id;
-- this could fail if snapshot is also referencing other releases
delete from snapshot_branch where object_id in (
select distinct branch_id
from temp_to_cleanup_snapshot_branches
);
delete from origin_visit where snapshot_id in (
select distinct snapshot_id
from temp_to_cleanup_snapshot_branches
);
delete from snapshot where object_id in (
select distinct snapshot_id
from temp_to_cleanup_snapshot_branches
);
delete from origin_visit ov
using temp_to_cleanup_origin_visit t
where ov.origin=t.origin and ov.visit = t.origin;
delete from fetch_history fh
using temp_to_cleanup_fetch_history t
where fh.id=t.id and fh.origin=t.origin;
-- do not drop the tables just yet, have a look that the
-- data have been deleted alright.
$$;
begin;
-- effective cleanup marking starts here (create table, populate them)
select * from mark_wrong_data_for_cleanup();
-- effective cleanup starts here (does not drop table too)
select * from clean_wrong_data_up();
-- post-analysis after manual cleanup
-- if all is good
-- we can clean up
drop function mark_wrong_data_for_cleanup();
drop function mark_wrong_origin_visit_and_fetch_history_for_cleanup();
drop function mark_wrong_release_and_snapshot_for_cleanup();
drop function clean_wrong_data_up();
-- drop the tables
drop table temp_to_cleanup_release;
drop table temp_to_cleanup_snapshot_branches;
drop table temp_to_cleanup_origin_visit;
drop table temp_to_cleanup_fetch_history;
commit;
--rollback;

Event Timeline

ardumont created this paste.Jul 25 2018, 6:23 PM
ardumont changed the title of this paste from Cleanup wrong data from mercurial loading to Cleanup wrong data from mercurial loading - wip.
ardumont edited the content of this paste. (Show Details)Jul 26 2018, 11:57 AM
ardumont changed the title of this paste from Cleanup wrong data from mercurial loading - wip to mercurial loading release issue - Cleanup release/snapshot/origin_visits procedure.Jul 26 2018, 12:02 PM
ardumont edited the content of this paste. (Show Details)
ardumont edited the content of this paste. (Show Details)Jul 26 2018, 1:32 PM
ardumont edited the content of this paste. (Show Details)Jul 26 2018, 2:18 PM
ardumont edited the content of this paste. (Show Details)Jul 26 2018, 2:36 PM

Last edit [1] to update code comments.

[1] https://forge.softwareheritage.org/P286#1923

ardumont edited the content of this paste. (Show Details)Aug 22 2018, 3:41 PM

Last edit to write all data to "temporary" table prior to actually cleanup (it was only done for release and snapshot so far. It's also done for origin_visit and fetch_history now).

Only the "mark data to cleanup" step has been run on prado so far [1]. In effect, the tables temp_to_cleanup_* are populated with the cleanup data now.

[1] https://forge.softwareheritage.org/P286$163

ardumont edited the content of this paste. (Show Details)Aug 24 2018, 11:03 AM
olasd added a subscriber: olasd.Sep 12 2018, 5:01 PM

Spot checks:

temp_to_cleanup_origin_visit/temp_to_cleanup_fetch_history:

Check that all origins come from google code.

select distinct substring(url for 24)
  from origin 
  where exists (
    select 1 from temp_to_cleanup_origin_visit
    where origin.id = temp_to_cleanup_origin_visit.origin
  ) and substring(url from length(url) - 18) != '.googlecode.com/hg/';

returns only https://code.google.com/.

temp_to_cleanup_release:

Check all releases have the shape of bogus mercurial releases:

with to_check as (
  select distinct(id) from temp_to_cleanup_release
) 
select distinct author, target_type, date
  from release
  where exists (
    select 1 from to_check where to_check.id = release.id
  );

returns

  author  | target_type | date | comment | synthetic
----------+-------------+------+---------+-----------
 13868695 | revision    |      |         | f

Check all releases point to nowhere


with to_check as (
  select distinct id, target
    from temp_to_cleanup_release
    inner join release using (id)
)
select id
  from revision
  where exists (
    select 1 from to_check where to_check.target = revision.id
  );

returns 0 rows.

temp_to_cleanup_snapshot_branches:

Check all origin_visits to be removed point to one of these snapshots.


with to_check as (
  select distinct snapshot_id
    from temp_to_cleanup_origin_visit
    inner join origin_visit using (origin, visit)
)
select distinct snapshot_id
  from temp_to_cleanup_snapshot_branches t
  where not exists (
    select 1 from to_check
    where to_check.snapshot_id = t.snapshot_id
  );

returns 0 rows.

Vice versa, check all snapshots are referenced by only these origin_visits.

with to_check as (
  select distinct snapshot_id
    from temp_to_cleanup_snapshot_branches
) select distinct origin, visit
  from origin_visit
  where exists (
    select 1 from to_check
    where to_check.snapshot_id = origin_visit.snapshot_id
  ) and not exists (
    select 1 from temp_to_cleanup_origin_visit t
    where t.origin = origin_visit.origin and t.visit = origin_visit.visit
  );

returns 0 rows.

Check all branches marked as to be removed are not referenced in any other snapshots

select count(distinct branch_id)
  from snapshot_branches
  where exists (
    select 1 from temp_to_cleanup_snapshot_branches t
    where t.branch_id = snapshot_branches.branch_id
  ) and not exists (
    select 1 from temp_to_cleanup_snapshot_branches t
    where t.snapshot_id = snapshot_branches.snapshot_id
  );

returns 0.

I think we're safe for removal of these objects.