softwareheritage=> select mimetype, count, percent from swh_content_mimetype_text_repartition(); mimetype | count | percent ------------------------------------+---------+----------- text/plain | 2764656 | 37.3915 text/x-c | 1603002 | 21.6803 text/x-po | 593128 | 8.0220 text/html | 464516 | 6.2825 text/x-c++ | 398550 | 5.3903 application/xml | 379255 | 5.1294 text/x-php | 317574 | 4.2951 text/x-ruby | 279675 | 3.7826 text/x-python | 223713 | 3.0257 text/x-shellscript | 33402 | .4518 text/x-makefile | 20722 | .2803 text/x-asm | 11816 | .1598 image/svg+xml | 9961 | .1347 text/x-diff | 8486 | .1148 text/x-pascal | 7876 | .1065 text/troff | 7285 | .0985 text/x-lisp | 5694 | .0770 text/x-m4 | 4804 | .0650 text/x-tex | 4388 | .0593 application/javascript | 2811 | .0380 text/x-msdos-batch | 2380 | .0322 text/x-fortran | 2125 | .0287 text/x-perl | 2001 | .0271 application/x-elc | 531 | .0072 application/zlib | 518 | .0070 message/rfc822 | 259 | .0035 application/postscript | 227 | .0031 image/x-xpmi | 142 | .0019 text/rtf | 128 | .0017 application/vnd.ms-fontobject | 95 | .0013 text/x-lua | 94 | .0013 text/x-awk | 89 | .0012 application/pdf | 81 | .0011 application/x-setupscript | 57 | .0008 application/pgp-keys | 44 | .0006 message/news | 42 | .0006 text/x-tcl | 35 | .0005 image/x-portable-pixmap | 31 | .0004 application/pgp-signature | 29 | .0004 text/x-vcard | 21 | .0003 video/quicktime | 10 | .0001 application/mac-binhex40 | 5 | .0001 image/x-portable-greymap | 5 | .0001 application/vnd.font-fontforge-sfd | 4 | .0001 image/gif | 3 | .0000 application/x-wine-extension-ini | 3 | .0000 application/x-bzip2 | 3 | .0000 application/x-freemind | 3 | .0000 image/x-portable-bitmap | 2 | .0000 model/vrml | 2 | .0000 application/vnd.fdf | 1 | .0000 application/x-archive | 1 | .0000 application/vnd.ms-opentype | 1 | .0000 application/pgp | 1 | .0000 application/x-kdelnk | 1 | .0000 audio/mpeg | 1 | .0000 text/x-nawk | 1 | .0000 (57 rows) softwareheritage=> select (100 * (select count(*) from content_mimetype where encoding <> 'binary')) / (select count(*) from content_mimetype)::real as percent; percent ------------------ 96.7064603246392 (1 row) softwareheritage=> select count(*) from content_mimetype; count --------- 7393808 (1 row) softwareheritage=> select count(*) from content_mimetype where encoding <> 'binary'; count --------- 7150290 (1 row) softwareheritage=> select count(*) from content_language; count --------- 7017866 (1 row) # sql create type swh_content_mimetype_text_repartition_signature as ( mimetype text, count bigint, percent text ); create or replace function swh_content_mimetype_text_repartition() returns setof swh_content_mimetype_text_repartition_signature language plpgsql as $$ declare total real; begin select count(id) from content_mimetype into total; return query select convert_from(mimetype, 'utf-8'), count(id) as count, to_char((100.0 * count(id) / total), '999.9999') as percent from content_mimetype where encoding <> 'binary' group by mimetype order by count desc; end $$;