Page MenuHomeSoftware Heritage
Paste P286

mercurial loading release issue - Cleanup release/snapshot/origin_visits procedure
ActivePublic

Authored by ardumont on Jul 25 2018, 6:23 PM.
begin;
-- identify release and snapshots to mark them for cleanup
create or replace function mark_wrong_release_and_snapshot_for_cleanup()
returns void
language plpgsql
as $$
declare
release_id release.id%type;
snapshot_id snapshot.object_id%type;
branch_id snapshot_branches.branch_id%type;
begin
-- data to clean up
create table temp_to_cleanup_release(
id sha1_git not null
);
create table temp_to_cleanup_snapshot_branches(
snapshot_id bigint not null, branch_id bigint not null
);
for release_id, snapshot_id, branch_id in (
select distinct r.id, s.object_id, sbs.branch_id
from origin o
inner join origin_visit ov on (o.id=ov.origin and o.type='hg')
inner join snapshot s on ov.snapshot_id=s.object_id
inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id
inner join snapshot_branch sb on sbs.branch_id=sb.object_id
inner join release r on (sb.target_type='release' and r.id=sb.target)
)
loop
-- reference the release to clean up later
insert into temp_to_cleanup_release(id)
values (release_id);
-- then referencing raw snapshot targetting release (not enough though)
insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id)
values (snapshot_id, branch_id);
end loop;
-- Note: if a snapshot is mixed with target of type release and
-- other stuff (revision, etc...), it will not be identified by the
-- prior loop. But this needs to as those snapshot identifiers will
-- be dangling (indeed, the next ingestion will create other correct
-- snapshots with other identifiers).
-- To conclude, we need to iterate over the remaining snapshots as
-- well to mark them for cleanup as well
for snapshot_id, branch_id in (
select distinct s.object_id, sbs.branch_id
from origin o
inner join origin_visit ov on (o.id=ov.origin and o.type='hg')
inner join snapshot s on ov.snapshot_id=s.object_id
inner join snapshot_branches sbs on s.object_id=sbs.snapshot_id
inner join snapshot_branch sb on sbs.branch_id=sb.object_id
)
loop
insert into temp_to_cleanup_snapshot_branches(snapshot_id, branch_id)
values (snapshot_id, branch_id); -- this will possibly create
-- duplicates here (not a
-- problem)
end loop;
return;
end;
$$;
-- clean up 'simple' origin data
create or replace function mark_wrong_origin_visit_and_fetch_history_for_cleanup()
returns void
language plpgsql
as $$
declare
origin_id origin.id%type;
begin
create table if not exists temp_to_cleanup_origin_visit(
origin bigint not null,
visit bigint not null
);
create table if not exists temp_to_cleanup_fetch_history(
id bigint not null,
origin bigint not null
);
-- clean up those origins' metadata
for origin_id in (
select id from origin where type='hg'
)
loop
insert into temp_to_cleanup_origin_visit(origin, visit)
select origin, visit from origin_visit where origin=origin_id;
insert into temp_to_cleanup_fetch_history(id, origin)
select id, origin from fetch_history where origin=origin_id;
end loop;
return;
end;
$$;
-- mark wrong data for cleanup, following tables will be populated:
-- - temp_to_cleanup_release
-- - temp_to_cleanup_snapshot_branches
-- - temp_to_cleanup_origin_visit
-- - temp_to_cleanup_fetch_history
create or replace function mark_wrong_data_for_cleanup()
returns void
language plpgsql
as $$
begin
perform mark_wrong_release_and_snapshot_for_cleanup();
perform mark_wrong_origin_visit_and_fetch_history_for_cleanup();
end;
$$;
-- cleanup
create or replace function clean_wrong_data_up()
returns void
language sql volatile
as $$
delete from release where id in (
select distinct id
from temp_to_cleanup_release
);
delete from snapshot_branches sbs
using temp_to_cleanup_snapshot_branches t
where sbs.branch_id=t.branch_id and sbs.snapshot_id=t.snapshot_id;
-- this could fail if snapshot is also referencing other releases
delete from snapshot_branch where object_id in (
select distinct branch_id
from temp_to_cleanup_snapshot_branches
);
delete from origin_visit where snapshot_id in (
select distinct snapshot_id
from temp_to_cleanup_snapshot_branches
);
delete from snapshot where object_id in (
select distinct snapshot_id
from temp_to_cleanup_snapshot_branches
);
delete from origin_visit ov
using temp_to_cleanup_origin_visit t
where ov.origin=t.origin and ov.visit = t.origin;
delete from fetch_history fh
using temp_to_cleanup_fetch_history t
where fh.id=t.id and fh.origin=t.origin;
-- do not drop the tables just yet, have a look that the
-- data have been deleted alright.
$$;
begin;
-- effective cleanup marking starts here (create table, populate them)
select * from mark_wrong_data_for_cleanup();
-- effective cleanup starts here (does not drop table too)
select * from clean_wrong_data_up();
-- post-analysis after manual cleanup
-- if all is good
-- we can clean up
drop function mark_wrong_data_for_cleanup();
drop function mark_wrong_origin_visit_and_fetch_history_for_cleanup();
drop function mark_wrong_release_and_snapshot_for_cleanup();
drop function clean_wrong_data_up();
-- drop the tables
drop table temp_to_cleanup_release;
drop table temp_to_cleanup_snapshot_branches;
drop table temp_to_cleanup_origin_visit;
drop table temp_to_cleanup_fetch_history;
commit;
--rollback;

Event Timeline

ardumont changed the title of this paste from Cleanup wrong data from mercurial loading to Cleanup wrong data from mercurial loading - wip.
ardumont changed the title of this paste from Cleanup wrong data from mercurial loading - wip to mercurial loading release issue - Cleanup release/snapshot/origin_visits procedure.Jul 26 2018, 12:02 PM
ardumont edited the content of this paste. (Show Details)

Last edit to write all data to "temporary" table prior to actually cleanup (it was only done for release and snapshot so far. It's also done for origin_visit and fetch_history now).

Only the "mark data to cleanup" step has been run on prado so far [1]. In effect, the tables temp_to_cleanup_* are populated with the cleanup data now.

[1] https://forge.softwareheritage.org/P286$163

Spot checks:

temp_to_cleanup_origin_visit/temp_to_cleanup_fetch_history:

Check that all origins come from google code.

select distinct substring(url for 24)
  from origin 
  where exists (
    select 1 from temp_to_cleanup_origin_visit
    where origin.id = temp_to_cleanup_origin_visit.origin
  ) and substring(url from length(url) - 18) != '.googlecode.com/hg/';

returns only https://code.google.com/.

temp_to_cleanup_release:

Check all releases have the shape of bogus mercurial releases:

with to_check as (
  select distinct(id) from temp_to_cleanup_release
) 
select distinct author, target_type, date
  from release
  where exists (
    select 1 from to_check where to_check.id = release.id
  );

returns

  author  | target_type | date | comment | synthetic
----------+-------------+------+---------+-----------
 13868695 | revision    |      |         | f

Check all releases point to nowhere


with to_check as (
  select distinct id, target
    from temp_to_cleanup_release
    inner join release using (id)
)
select id
  from revision
  where exists (
    select 1 from to_check where to_check.target = revision.id
  );

returns 0 rows.

temp_to_cleanup_snapshot_branches:

Check all origin_visits to be removed point to one of these snapshots.


with to_check as (
  select distinct snapshot_id
    from temp_to_cleanup_origin_visit
    inner join origin_visit using (origin, visit)
)
select distinct snapshot_id
  from temp_to_cleanup_snapshot_branches t
  where not exists (
    select 1 from to_check
    where to_check.snapshot_id = t.snapshot_id
  );

returns 0 rows.

Vice versa, check all snapshots are referenced by only these origin_visits.

with to_check as (
  select distinct snapshot_id
    from temp_to_cleanup_snapshot_branches
) select distinct origin, visit
  from origin_visit
  where exists (
    select 1 from to_check
    where to_check.snapshot_id = origin_visit.snapshot_id
  ) and not exists (
    select 1 from temp_to_cleanup_origin_visit t
    where t.origin = origin_visit.origin and t.visit = origin_visit.visit
  );

returns 0 rows.

Check all branches marked as to be removed are not referenced in any other snapshots

select count(distinct branch_id)
  from snapshot_branches
  where exists (
    select 1 from temp_to_cleanup_snapshot_branches t
    where t.branch_id = snapshot_branches.branch_id
  ) and not exists (
    select 1 from temp_to_cleanup_snapshot_branches t
    where t.snapshot_id = snapshot_branches.snapshot_id
  );

returns 0.

I think we're safe for removal of these objects.