Page MenuHomeSoftware Heritage
Paste P117

swh mimetype recognized repartition - updated
ActivePublic

Authored by ardumont on Oct 21 2016, 11:36 AM.
softwareheritage=> select mimetype, count, percent from swh_content_mimetype_text_repartition();
mimetype | count | percent
------------------------------------+---------+-----------
text/plain | 2764656 | 37.3915
text/x-c | 1603002 | 21.6803
text/x-po | 593128 | 8.0220
text/html | 464516 | 6.2825
text/x-c++ | 398550 | 5.3903
application/xml | 379255 | 5.1294
text/x-php | 317574 | 4.2951
text/x-ruby | 279675 | 3.7826
text/x-python | 223713 | 3.0257
text/x-shellscript | 33402 | .4518
text/x-makefile | 20722 | .2803
text/x-asm | 11816 | .1598
image/svg+xml | 9961 | .1347
text/x-diff | 8486 | .1148
text/x-pascal | 7876 | .1065
text/troff | 7285 | .0985
text/x-lisp | 5694 | .0770
text/x-m4 | 4804 | .0650
text/x-tex | 4388 | .0593
application/javascript | 2811 | .0380
text/x-msdos-batch | 2380 | .0322
text/x-fortran | 2125 | .0287
text/x-perl | 2001 | .0271
application/x-elc | 531 | .0072
application/zlib | 518 | .0070
message/rfc822 | 259 | .0035
application/postscript | 227 | .0031
image/x-xpmi | 142 | .0019
text/rtf | 128 | .0017
application/vnd.ms-fontobject | 95 | .0013
text/x-lua | 94 | .0013
text/x-awk | 89 | .0012
application/pdf | 81 | .0011
application/x-setupscript | 57 | .0008
application/pgp-keys | 44 | .0006
message/news | 42 | .0006
text/x-tcl | 35 | .0005
image/x-portable-pixmap | 31 | .0004
application/pgp-signature | 29 | .0004
text/x-vcard | 21 | .0003
video/quicktime | 10 | .0001
application/mac-binhex40 | 5 | .0001
image/x-portable-greymap | 5 | .0001
application/vnd.font-fontforge-sfd | 4 | .0001
image/gif | 3 | .0000
application/x-wine-extension-ini | 3 | .0000
application/x-bzip2 | 3 | .0000
application/x-freemind | 3 | .0000
image/x-portable-bitmap | 2 | .0000
model/vrml | 2 | .0000
application/vnd.fdf | 1 | .0000
application/x-archive | 1 | .0000
application/vnd.ms-opentype | 1 | .0000
application/pgp | 1 | .0000
application/x-kdelnk | 1 | .0000
audio/mpeg | 1 | .0000
text/x-nawk | 1 | .0000
(57 rows)
softwareheritage=> select (100 * (select count(*) from content_mimetype where encoding <> 'binary')) / (select count(*) from content_mimetype)::real as percent;
percent
------------------
96.7064603246392
(1 row)
softwareheritage=> select count(*) from content_mimetype;
count
---------
7393808
(1 row)
softwareheritage=> select count(*) from content_mimetype where encoding <> 'binary';
count
---------
7150290
(1 row)
softwareheritage=> select count(*) from content_language;
count
---------
7017866
(1 row)
# sql
create type swh_content_mimetype_text_repartition_signature as (
mimetype text,
count bigint,
percent text
);
create or replace function swh_content_mimetype_text_repartition()
returns setof swh_content_mimetype_text_repartition_signature
language plpgsql
as $$
declare
total real;
begin
select count(id) from content_mimetype into total;
return query
select convert_from(mimetype, 'utf-8'), count(id) as count, to_char((100.0 * count(id) / total), '999.9999') as percent
from content_mimetype
where encoding <> 'binary'
group by mimetype
order by count desc;
end
$$;

Event Timeline

ardumont updated the paste's language from autodetect to sql.Oct 21 2016, 11:38 AM
ardumont changed the title of this paste from swh mimetype recognized repartition to swh mimetype recognized repartition - updated.Oct 24 2016, 11:36 AM
ardumont edited the content of this paste. (Show Details)