#+title: Check the git reader's data
* Update swh.loader.git.reader to list sha1s from origin
https://forge.softwareheritage.org/rDLDGae4606dbb59b0c588e81191f6356c4cba12c64e3
* Randomly choose 4 origins
From uffizi:
#+BEGIN_SRC sh
ardumont@uffizi:~% pigz -dc /srv/storage/space/lists/1000-stars-repositories.txt.gz | sort -R | head -4
https://github.com/Bilibili/ijkplayer 8873
https://github.com/jcjohnson/neural-style 11294
https://github.com/zurb/foundation-sites 24330
https://github.com/babel/babel 18273
#+END_SRC
Note: second column is the number of stars
* List their contents
From worker01 (python3-swh.loader.git version 0.22):
#+BEGIN_SRC sh
ardumont@worker01:~% for u in https://github.com/Bilibili/ijkplayer https://github.com/jcjohnson/neural-style https://github.com/zurb/foundation-sites https://github.com/babel/babel; do python3 -m swh.loader.git.reader --origin-url $u > $(basename $u); done
2016-11-04 10:02:07,700 6184 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:10,728 6184 "POST /origin/get HTTP/1.1" 200 78
Counting objects: 25946, done.
Compressing objects: 100% (108/108), done.
Total 25946 (delta 64), reused 0 (delta 0), pack-reused 25837
2016-11-04 10:02:19,675 6184 Listed 162 refs for repo https://github.com/Bilibili/ijkplayer
2016-11-04 10:02:46,510 6191 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:48,079 6191 "POST /origin/get HTTP/1.1" 200 82
Counting objects: 594, done.
Total 594 (delta 0), reused 0 (delta 0), pack-reused 594
2016-11-04 10:02:51,023 6191 Listed 38 refs for repo https://github.com/jcjohnson/neural-style
2016-11-04 10:02:53,049 6196 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 10:02:54,607 6196 "POST /origin/get HTTP/1.1" 200 81
Counting objects: 95241, done.
Compressing objects: 100% (246/246), done.
Total 95241 (delta 158), reused 26 (delta 26), pack-reused 94959
2016-11-04 10:03:03,456 6196 Listed 2978 refs for repo https://github.com/zurb/foundation-sites
2016-11-04 14:07:21,040 3513 Starting new HTTP connection (1): uffizi.internal.softwareheritage.org
2016-11-04 14:07:21,124 3513 "POST /origin/get HTTP/1.1" 200 70
Counting objects: 100340, done.
Compressing objects: 100% (101/101), done.
Total 100340 (delta 39), reused 1 (delta 1), pack-reused 100237
2016-11-04 14:07:25,070 3513 Listed 1877 refs for repo https://github.com/babel/babel
#+END_SRC
Which gives:
#+BEGIN_SRC sh
ardumont@worker01:~/analysis/reader% for f in *; do wc -l $f; done
29390 foundation-sites
8069 ijkplayer
215 neural-style
34665 babel
#+END_SRC
* Check their list of contents roughly matches what we have in swh storage
#+BEGIN_SRC sql
-- List the sha1 of every 'file' entry reachable from the revisions that
-- cache_revision_origin associates with the given origin.
--
-- Parameters:
--   origin_url  - url of the origin to look up in the origin table
--   origin_type - type of that origin (e.g. 'git')
--
-- Returns: one sha1 per file entry yielded by swh_revision_walk, for each
-- cached revision of the origin. The result is NOT de-duplicated (the same
-- sha1 is returned once per walk row that yields it), so callers that want
-- a set should use "select distinct". An unknown origin, or an origin with
-- no rows in cache_revision_origin, yields an empty result.
create or replace function swh_all_contents_from_origin(origin_url text, origin_type text)
    returns setof sha1
    language plpgsql
as $$
declare
    origin_id bigint;
begin
    -- Resolve the origin id; it stays null when no origin matches, in which
    -- case the query below matches no row.
    select o.id
    from origin o
    where o.url = origin_url
      and o.type = origin_type
    limit 1
    into origin_id;

    -- Walk every cached revision of that origin and emit the sha1 of each
    -- file entry. The function call in FROM is evaluated once per cro row
    -- (lateral), replacing the former nested row-by-row loops.
    return query
        select cast(walk.sha1 as sha1)  -- explicit cast: return query requires the exact result type
        from cache_revision_origin cro
        cross join lateral swh_revision_walk(cro.revision) walk
        where cro.origin = origin_id
          and walk.type = 'file';

    return;
end
$$;
#+END_SRC
Check the number of known contents in swh:
#+BEGIN_SRC sh
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/jcjohnson/neural-style', 'git')) to neural-style;
COPY 214 -- OK
\copy
(select distinct * from swh_all_contents_from_origin('https://github.com/Bilibili/ijkplayer', 'git')) to ijkplayer;
COPY 7967 -- OK
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/zurb/foundation-sites', 'git')) to foundation-sites;
COPY 0 -- KO - not populated yet in cache_revision_origin so this one origin does not help (we use the cache in the function drafted above)
\copy (select distinct * from swh_all_contents_from_origin('https://github.com/babel/babel', 'git')) to babel;
COPY 33882 -- OK
#+END_SRC