#+title: Check the git reader's data
* Update swh.loader.git.reader to list sha1s from origin
https://forge.softwareheritage.org/rDLDGae4606dbb59b0c588e81191f6356c4cba12c64e3
* Choose randomly 4 origins
From uffizi:
#+BEGIN_SRC sh
ardumont@uffizi:~% pigz -dc /srv/storage/space/lists/1000-stars-repositories.txt.gz | sort -R | head -4
https://github.com/Bilibili/ijkplayer 8873
https://github.com/jcjohnson/neural-style 11294
https://github.com/zurb/foundation-sites 24330
https://github.com/babel/babel 18273
#+END_SRC
* List their contents
From worker01 (python3-swh.loader.git version 0.22):
#+BEGIN_SRC sh
ardumont@worker01:~% for u in https://github.com/Bilibili/ijkplayer https://github.com/jcjohnson/neural-style https://github.com/zurb/foundation-sites https://github.com/babel/babel; do python3 -m swh.loader.git.reader --origin-url $u > $(basename $u); done
2016-11-04 10:02:07,700 6184 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:10,728 6184 "POST /origin/get HTTP/1.1" 200 78
Counting objects: 25946, done.
Compressing objects: 100% (108/108), done.
Total 25946 (delta 64), reused 0 (delta 0), pack-reused 25837
2016-11-04 10:02:19,675 6184 Listed 162 refs for repo https://github.com/Bilibili/ijkplayer
2016-11-04 10:02:46,510 6191 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:48,079 6191 "POST /origin/get HTTP/1.1" 200 82
Counting objects: 594, done.
Total 594 (delta 0), reused 0 (delta 0), pack-reused 594
2016-11-04 10:02:51,023 6191 Listed 38 refs for repo https://github.com/jcjohnson/neural-style
2016-11-04 10:02:53,049 6196 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:54,607 6196 "POST /origin/get HTTP/1.1" 200 81
Counting objects: 95241, done.
Compressing objects: 100% (246/246), done.
Total 95241 (delta 158), reused 26 (delta 26), pack-reused 94959
2016-11-04 10:03:03,456 6196 Listed 2978 refs for repo https://github.com/zurb/foundation-sites
2016-11-04 14:07:21,040 3513 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 14:07:21,124 3513 "POST /origin/get HTTP/1.1" 200 70
Counting objects: 100340, done.
Compressing objects: 100% (101/101), done.
Total 100340 (delta 39), reused 1 (delta 1), pack-reused 100237
2016-11-04 14:07:25,070 3513 Listed 1877 refs for repo https://github.com/babel/babel
#+END_SRC
Which gives:
#+BEGIN_SRC sh
ardumont@worker01:~/analysis/reader% for f in *; do wc -l $f; done
29390 foundation-sites
8069 ijkplayer
215 neural-style
34665 babel
#+END_SRC
* Check their list of contents roughly matches what we have in swh storage
#+BEGIN_SRC sql
-- List the sha1 of the contents (files) found by walking every revision
-- cached for a given origin.
--
-- Parameters:
--   origin_url:  url of the origin, matched against origin.url
--   origin_type: type of the origin, matched against origin.type
--
-- Returns:
--   One sha1 per 'file' entry produced by swh_revision_walk, for each
--   revision listed in cache_revision_origin for that origin. Rows are NOT
--   deduplicated; use "select distinct" on the result when unique contents
--   are needed. No rows are returned when the origin is unknown or when it
--   has no entry in cache_revision_origin.
--
-- Note: only cache_revision_origin is consulted to find the revisions; the
-- occurrence table is not read.
create or replace function swh_all_contents_from_origin(origin_url text, origin_type text)
    returns setof sha1
    language plpgsql
as $$
declare
    origin_id   bigint;
    rev_id      sha1_git;
    content_id  sha1;
begin
    -- Retrieve the origin id
    select o.id
    into origin_id
    from origin o
    where o.url = origin_url
      and o.type = origin_type
    limit 1;

    -- Unknown origin: nothing to list
    if origin_id is null then
        return;
    end if;

    -- For each revision cached for that origin...
    for rev_id in
        select cro.revision
        from cache_revision_origin cro
        where cro.origin = origin_id
    loop
        -- ...walk the revision and keep only the 'file' entries
        for content_id in
            select w.sha1
            from swh_revision_walk(rev_id) as w
            where w.type = 'file'
        loop
            -- And send them as result
            return next content_id;
        end loop;
    end loop;

    return;
end
$$;
#+END_SRC
Check the number of known contents in swh:
#+BEGIN_SRC sql
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/jcjohnson/neural-style', 'git')) to neural-style;
COPY 214 -- OK
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/Bilibili/ijkplayer', 'git')) to ijkplayer;
COPY 7967 -- OK
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/zurb/foundation-sites', 'git')) to foundation-sites;
COPY 0 -- KO - this origin is not populated yet in cache_revision_origin, so it cannot be checked this way (the function drafted above relies on that cache)
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/babel/babel', 'git')) to babel;
...
#+END_SRC
Here, "..." means the query is still running.