diff --git a/.gitignore b/.gitignore index f5fc2ae..43c4b92 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ *.pyc *.sw? *~ .coverage .eggs/ __pycache__ *.egg-info/ -version.txt \ No newline at end of file +version.txt +/sql/createdb-stamp +/sql/filldb-stamp diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..d52d181 --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,43 @@ +# Depends: postgresql-client, postgresql-autodoc + +DBNAME = softwareheritage-indexer-dev +DOCDIR = autodoc + +SQL_INIT = swh-init.sql +SQL_ENUMS = swh-enums.sql +SQL_SCHEMA = swh-schema.sql +SQL_FUNC = swh-func.sql +SQL_DATA = swh-data.sql +SQL_INDEX = swh-indexes.sql +SQLS = $(SQL_INIT) $(SQL_ENUMS) $(SQL_SCHEMA) $(SQL_FUNC) $(SQL_INDEX) $(SQL_DATA) + +PSQL_BIN = psql +PSQL_FLAGS = --echo-all -X -v ON_ERROR_STOP=1 +PSQL = $(PSQL_BIN) $(PSQL_FLAGS) + +all: + +createdb: createdb-stamp +createdb-stamp: $(SQL_INIT) + createdb $(DBNAME) + touch $@ + +filldb: filldb-stamp +filldb-stamp: createdb-stamp + cat $(SQLS) | $(PSQL) $(DBNAME) + touch $@ + +dropdb: + -dropdb $(DBNAME) + +dumpdb: swh-indexer.dump +swh-indexer.dump: filldb-stamp + pg_dump -Fc $(DBNAME) > $@ + +clean: + rm -rf *-stamp $(DOCDIR)/ + +distclean: clean dropdb + rm -f swh-indexer.dump + +.PHONY: all initdb createdb dropdb doc clean diff --git a/sql/bin/db-upgrade b/sql/bin/db-upgrade new file mode 100755 index 0000000..1dd4e2b --- /dev/null +++ b/sql/bin/db-upgrade @@ -0,0 +1,73 @@ +#!/bin/bash + +# Compute a draft upgrade script for the DB schema, based on Git revisions. + +# Depends: apgdiff + +set -e + +SQLS="swh-*.sql" +VERSION_SQL="swh-schema.sql" +UPGRADE_DIR="upgrades" +DB_NAME="softwareheritage-dev" + +usage () { + echo "Usage: db-upgrade GIT_REV_FROM [GIT_REV_TO]" + echo "Example: db-upgrade HEAD^" + echo " db-upgrade HEAD~4 HEAD~2" + echo "See also: gitrevisions(7)" + exit 1 +} + +pg_dump_revision () { + rev="$1" + dump="$2" + + echo "checking out revision $rev, and dumping DB at the time..." + if [ "$rev" != "HEAD" ] ; then + git checkout --quiet "$rev" + fi + make distclean filldb > /dev/null + pg_dump "$DB_NAME" > "$dump" + if [ "$rev" != "HEAD" ] ; then + git checkout --quiet - + fi +} + +# argument parsing +if [ -z "$1" ] ; then + usage +fi +from_rev="$1" +shift 1 +if [ -z "$1" ] ; then + to_rev="HEAD" +else + to_rev="$1" + shift 1 +fi + +old_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) +new_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) +trap "rm -f $old_dump $new_dump" EXIT + +schema_version=$(grep -i -A 1 '^insert into dbversion' "$VERSION_SQL" | tail -n 1 \ + | sed -e 's/.*values(//i' -e 's/,.*//') +upgrade_script=$(mktemp -p "$UPGRADE_DIR" $(printf '%.03d' ${schema_version}).XXXX.sql) +pg_dump_revision "$from_rev" "$old_dump" +pg_dump_revision "$to_rev" "$new_dump" + +cat > "$upgrade_script" <> "$upgrade_script" + +echo "all done." +echo "Draft upgrade script is at: ${upgrade_script}" diff --git a/sql/bin/dot_add_content b/sql/bin/dot_add_content new file mode 100755 index 0000000..fc24e38 --- /dev/null +++ b/sql/bin/dot_add_content @@ -0,0 +1,15 @@ +#!/bin/bash + +DOT_FILE="$1" +DOT_EXTRA="$2" +if [ -z "$DOT_FILE" -o -z "$DOT_EXTRA" ] ; then + echo "Usage: $0 DOT_FILE DOT_EXTRA" + exit 1 +fi + +schema_version=$(grep -i -A 1 '^insert into dbversion' swh-schema.sql | tail -n 1 \ + | sed -e 's/.*values(//i' -e 's/,.*//') + +head -n -1 "$DOT_FILE" # all of $DOT_FILE but last line +sed "s/@@VERSION@@/$schema_version/" "$DOT_EXTRA" +echo "}" diff --git a/sql/doc/json b/sql/doc/json new file mode 120000 index 0000000..d1d9852 --- /dev/null +++ b/sql/doc/json @@ -0,0 +1 @@ +../json \ No newline at end of file diff --git a/sql/doc/sql b/sql/doc/sql new file mode 120000 index 0000000..8f225da --- /dev/null +++ b/sql/doc/sql @@ -0,0 +1 @@ +../autodoc \ No newline at end of file diff --git a/sql/json/.gitignore b/sql/json/.gitignore new file mode 100644 index 0000000..c337aa9 --- /dev/null +++ b/sql/json/.gitignore @@ -0,0 +1 @@ +*-stamp diff --git a/sql/json/Makefile b/sql/json/Makefile new file mode 100644 index 0000000..5d983b8 --- /dev/null +++ b/sql/json/Makefile @@ -0,0 +1,19 @@ +# Depends: json-glib-tools + +JSONVAL = json-glib-validate +JSONS = $(wildcard *.json) + +all: validate +check: validate +test: validate + +validate: validate-stamp +validate-stamp: $(JSONS) + make $(patsubst %,validate/%,$?) + touch $@ + +validate/%: + $(JSONVAL) $* + +clean: + rm -f validate-stamp diff --git a/sql/json/indexer_configuration.tool_configuration.schema.json b/sql/json/indexer_configuration.tool_configuration.schema.json new file mode 100644 index 0000000..28396b4 --- /dev/null +++ b/sql/json/indexer_configuration.tool_configuration.schema.json @@ -0,0 +1,11 @@ +{ + "$schema": "http://json-schema.org/schema#", + "id": "http://softwareheritage.org/schemas/indexer_configuration.tool_configuration.schema.json", + + "type": "object", + "properties": { + "command_line": { + "type": "string" + } + } +} diff --git a/sql/json/revision_metadata.translated_metadata.json b/sql/json/revision_metadata.translated_metadata.json new file mode 100644 index 0000000..1806fc7 --- /dev/null +++ b/sql/json/revision_metadata.translated_metadata.json @@ -0,0 +1,59 @@ +{ + "$schema": "http://json-schema.org/schema#", + "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json", + + "type": "object", + "properties": { + "developmentStatus": { + "type": "list" + }, + "version": { + "type": "list" + }, + "operatingSystem": { + "type": "list" + }, + "description": { + "type": "list" + }, + "keywords": { + "type": "list" + }, + "issueTracker": { + "type": "list" + }, + "name": { + "type": "list" + }, + "author": { + "type": "list" + }, + "relatedLink": { + "type": "list" + }, + "url": { + "type": "list" + }, + "type": { + "type": "list" + }, + "license": { + "type": "list" + }, + "maintainer": { + "type": "list" + }, + "email": { + "type": "list" + }, + "softwareRequirements": { + "type": "list" + }, + "identifier": { + "type": "list" + }, + "codeRepository": { + "type": "list" + }, + } +} diff --git a/sql/swh-data.sql b/sql/swh-data.sql new file mode 100644 index 0000000..e429343 --- /dev/null +++ b/sql/swh-data.sql @@ -0,0 +1,26 @@ +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('nomos', '3.1.0rc2-31-ga2cbb8c', '{"command_line": "nomossa "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('file', '5.22', '{"command_line": "file --mime "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('universal-ctags', '~git7859817b', '{"command_line": "ctags --fields=+lnz --sort=no --links=no --output-format=json "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-deposit', '0.0.1', '{"sword_version": "2"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('file', '1:5.30-1+deb9u1', '{"type": "library", "debian-package": "python3-magic"}'); diff --git a/sql/swh-enums.sql b/sql/swh-enums.sql new file mode 100644 index 0000000..a357eb5 --- /dev/null +++ b/sql/swh-enums.sql @@ -0,0 +1,100 @@ +create type languages as enum ( 'abap', 'abnf', 'actionscript', + 'actionscript-3', 'ada', 'adl', 'agda', 'alloy', 'ambienttalk', + 'antlr', 'antlr-with-actionscript-target', 'antlr-with-c#-target', + 'antlr-with-cpp-target', 'antlr-with-java-target', + 'antlr-with-objectivec-target', 'antlr-with-perl-target', + 'antlr-with-python-target', 'antlr-with-ruby-target', 'apacheconf', + 'apl', 'applescript', 'arduino', 'aspectj', 'aspx-cs', 'aspx-vb', + 'asymptote', 'autohotkey', 'autoit', 'awk', 'base-makefile', 'bash', + 'bash-session', 'batchfile', 'bbcode', 'bc', 'befunge', + 'blitzbasic', 'blitzmax', 'bnf', 'boo', 'boogie', 'brainfuck', + 'bro', 'bugs', 'c', 'c#', 'c++', 'c-objdump', 'ca65-assembler', + 'cadl', 'camkes', 'cbm-basic-v2', 'ceylon', 'cfengine3', + 'cfstatement', 'chaiscript', 'chapel', 'cheetah', 'cirru', 'clay', + 'clojure', 'clojurescript', 'cmake', 'cobol', 'cobolfree', + 'coffeescript', 'coldfusion-cfc', 'coldfusion-html', 'common-lisp', + 'component-pascal', 'coq', 'cpp-objdump', 'cpsa', 'crmsh', 'croc', + 'cryptol', 'csound-document', 'csound-orchestra', 'csound-score', + 'css', 'css+django/jinja', 'css+genshi-text', 'css+lasso', + 'css+mako', 'css+mozpreproc', 'css+myghty', 'css+php', 'css+ruby', + 'css+smarty', 'cuda', 'cypher', 'cython', 'd', 'd-objdump', + 'darcs-patch', 'dart', 'debian-control-file', 'debian-sourcelist', + 'delphi', 'dg', 'diff', 'django/jinja', 'docker', 'dtd', 'duel', + 'dylan', 'dylan-session', 'dylanlid', 'earl-grey', 'easytrieve', + 'ebnf', 'ec', 'ecl', 'eiffel', 'elixir', 'elixir-iex-session', + 'elm', 'emacslisp', 'embedded-ragel', 'erb', 'erlang', + 'erlang-erl-session', 'evoque', 'ezhil', 'factor', 'fancy', + 'fantom', 'felix', 'fish', 'fortran', 'fortranfixed', 'foxpro', + 'fsharp', 'gap', 'gas', 'genshi', 'genshi-text', 'gettext-catalog', + 'gherkin', 'glsl', 'gnuplot', 'go', 'golo', 'gooddata-cl', 'gosu', + 'gosu-template', 'groff', 'groovy', 'haml', 'handlebars', 'haskell', + 'haxe', 'hexdump', 'html', 'html+cheetah', 'html+django/jinja', + 'html+evoque', 'html+genshi', 'html+handlebars', 'html+lasso', + 'html+mako', 'html+myghty', 'html+php', 'html+smarty', 'html+twig', + 'html+velocity', 'http', 'hxml', 'hy', 'hybris', 'idl', 'idris', + 'igor', 'inform-6', 'inform-6-template', 'inform-7', 'ini', 'io', + 'ioke', 'irc-logs', 'isabelle', 'j', 'jade', 'jags', 'jasmin', + 'java', 'java-server-page', 'javascript', 'javascript+cheetah', + 'javascript+django/jinja', 'javascript+genshi-text', + 'javascript+lasso', 'javascript+mako', 'javascript+mozpreproc', + 'javascript+myghty', 'javascript+php', 'javascript+ruby', + 'javascript+smarty', 'jcl', 'json', 'json-ld', 'julia', + 'julia-console', 'kal', 'kconfig', 'koka', 'kotlin', 'lasso', + 'lean', 'lesscss', 'lighttpd-configuration-file', 'limbo', 'liquid', + 'literate-agda', 'literate-cryptol', 'literate-haskell', + 'literate-idris', 'livescript', 'llvm', 'logos', 'logtalk', 'lsl', + 'lua', 'makefile', 'mako', 'maql', 'mask', 'mason', 'mathematica', + 'matlab', 'matlab-session', 'minid', 'modelica', 'modula-2', + 'moinmoin/trac-wiki-markup', 'monkey', 'moocode', 'moonscript', + 'mozhashpreproc', 'mozpercentpreproc', 'mql', 'mscgen', + 'msdos-session', 'mupad', 'mxml', 'myghty', 'mysql', 'nasm', + 'nemerle', 'nesc', 'newlisp', 'newspeak', + 'nginx-configuration-file', 'nimrod', 'nit', 'nix', 'nsis', 'numpy', + 'objdump', 'objdump-nasm', 'objective-c', 'objective-c++', + 'objective-j', 'ocaml', 'octave', 'odin', 'ooc', 'opa', + 'openedge-abl', 'pacmanconf', 'pan', 'parasail', 'pawn', 'perl', + 'perl6', 'php', 'pig', 'pike', 'pkgconfig', 'pl/pgsql', + 'postgresql-console-(psql)', 'postgresql-sql-dialect', 'postscript', + 'povray', 'powershell', 'powershell-session', 'praat', 'prolog', + 'properties', 'protocol-buffer', 'puppet', 'pypy-log', 'python', + 'python-3', 'python-3.0-traceback', 'python-console-session', + 'python-traceback', 'qbasic', 'qml', 'qvto', 'racket', 'ragel', + 'ragel-in-c-host', 'ragel-in-cpp-host', 'ragel-in-d-host', + 'ragel-in-java-host', 'ragel-in-objective-c-host', + 'ragel-in-ruby-host', 'raw-token-data', 'rconsole', 'rd', 'rebol', + 'red', 'redcode', 'reg', 'resourcebundle', 'restructuredtext', + 'rexx', 'rhtml', 'roboconf-graph', 'roboconf-instances', + 'robotframework', 'rpmspec', 'rql', 'rsl', 'ruby', + 'ruby-irb-session', 'rust', 's', 'sass', 'scala', + 'scalate-server-page', 'scaml', 'scheme', 'scilab', 'scss', 'shen', + 'slim', 'smali', 'smalltalk', 'smarty', 'snobol', 'sourcepawn', + 'sparql', 'sql', 'sqlite3con', 'squidconf', 'stan', 'standard-ml', + 'supercollider', 'swift', 'swig', 'systemverilog', 'tads-3', 'tap', + 'tcl', 'tcsh', 'tcsh-session', 'tea', 'termcap', 'terminfo', + 'terraform', 'tex', 'text-only', 'thrift', 'todotxt', + 'trafficscript', 'treetop', 'turtle', 'twig', 'typescript', + 'urbiscript', 'vala', 'vb.net', 'vctreestatus', 'velocity', + 'verilog', 'vgl', 'vhdl', 'viml', 'x10', 'xml', 'xml+cheetah', + 'xml+django/jinja', 'xml+evoque', 'xml+lasso', 'xml+mako', + 'xml+myghty', 'xml+php', 'xml+ruby', 'xml+smarty', 'xml+velocity', + 'xquery', 'xslt', 'xtend', 'xul+mozpreproc', 'yaml', 'yaml+jinja', + 'zephir', 'unknown' +); +comment on type languages is 'Languages recognized by language indexer'; + +create type ctags_languages as enum ( 'Ada', 'AnsiblePlaybook', 'Ant', + 'Asm', 'Asp', 'Autoconf', 'Automake', 'Awk', 'Basic', 'BETA', 'C', + 'C#', 'C++', 'Clojure', 'Cobol', 'CoffeeScript [disabled]', 'CSS', + 'ctags', 'D', 'DBusIntrospect', 'Diff', 'DosBatch', 'DTS', 'Eiffel', + 'Erlang', 'Falcon', 'Flex', 'Fortran', 'gdbinit [disabled]', + 'Glade', 'Go', 'HTML', 'Iniconf', 'Java', 'JavaProperties', + 'JavaScript', 'JSON', 'Lisp', 'Lua', 'M4', 'Make', 'man [disabled]', + 'MatLab', 'Maven2', 'Myrddin', 'ObjectiveC', 'OCaml', 'OldC + [disabled]', 'OldC++ [disabled]', 'Pascal', 'Perl', 'Perl6', 'PHP', + 'PlistXML', 'pod', 'Protobuf', 'Python', 'PythonLoggingConfig', 'R', + 'RelaxNG', 'reStructuredText', 'REXX', 'RpmSpec', 'Ruby', 'Rust', + 'Scheme', 'Sh', 'SLang', 'SML', 'SQL', 'SVG', 'SystemdUnit', + 'SystemVerilog', 'Tcl', 'Tex', 'TTCN', 'Vera', 'Verilog', 'VHDL', + 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir' +); +comment on type ctags_languages is 'Languages recognized by ctags indexer'; diff --git a/sql/swh-func.sql b/sql/swh-func.sql new file mode 100644 index 0000000..62df8fa --- /dev/null +++ b/sql/swh-func.sql @@ -0,0 +1,721 @@ +-- create a temporary table with a single "bytea" column for fast object lookup. +create or replace function swh_mktemp_bytea() + returns void + language sql +as $$ + create temporary table tmp_bytea ( + id bytea + ) on commit drop; +$$; + +-- create a temporary table called tmp_TBLNAME, mimicking existing table +-- TBLNAME +-- +-- Args: +-- tblname: name of the table to mimick +create or replace function swh_mktemp(tblname regclass) + returns void + language plpgsql +as $$ +begin + execute format(' + create temporary table tmp_%1$I + (like %1$I including defaults) + on commit drop; + alter table tmp_%1$I drop column if exists object_id; + ', tblname); + return; +end +$$; + +-- create a temporary table for content_ctags tmp_content_mimetype_missing, +create or replace function swh_mktemp_content_mimetype_missing() + returns void + language sql +as $$ + create temporary table tmp_content_mimetype_missing ( + id sha1, + indexer_configuration_id bigint + ) on commit drop; +$$; + +comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; + +-- check which entries of tmp_bytea are missing from content_mimetype +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_mimetype_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + (select id::sha1 from tmp_content_mimetype_missing as tmp + where not exists + (select 1 from content_mimetype as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); + return; +end +$$; + +comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; + +-- create a temporary table for content_mimetype tmp_content_mimetype, +create or replace function swh_mktemp_content_mimetype() + returns void + language sql +as $$ + create temporary table tmp_content_mimetype ( + like content_mimetype including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information'; + +-- add tmp_content_mimetype entries to content_mimetype, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_mimetype_missing must take place before calling this +-- function. +-- +-- +-- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, +-- 2. call this function +create or replace function swh_content_mimetype_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + + else + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) do nothing; + end if; + return; +end +$$; + +comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; + +create type content_mimetype_signature as( + id sha1, + mimetype bytea, + encoding bytea, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content mimetype from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_mimetype_get() + returns setof content_mimetype_signature + language plpgsql +as $$ +begin + return query + select c.id, mimetype, encoding, + i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_mimetype c on c.id=t.id + inner join indexer_configuration i on c.indexer_configuration_id=i.id; + return; +end +$$; + +comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; + +-- create a temporary table for content_language tmp_content_language, +create or replace function swh_mktemp_content_language_missing() + returns void + language sql +as $$ + create temporary table tmp_content_language_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; + +-- check which entries of tmp_bytea are missing from content_language +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_language_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_content_language_missing as tmp + where not exists + (select 1 from content_language as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_content_language_missing() IS 'Filter missing content languages'; + +-- add tmp_content_language entries to content_language, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_language_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_content_language, 2. call this function +create or replace function swh_content_language_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + + else + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_content_language_add(boolean) IS 'Add new content languages'; + +-- create a temporary table for retrieving content_language +create or replace function swh_mktemp_content_language() + returns void + language sql +as $$ + create temporary table tmp_content_language ( + like content_language including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_language() is 'Helper table to add content language'; + +create type content_language_signature as ( + id sha1, + lang languages, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content language from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_content_language_get() + returns setof content_language_signature + language plpgsql +as $$ +begin + return query + select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_language c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +comment on function swh_content_language_get() is 'List content''s language'; + + +-- create a temporary table for content_ctags tmp_content_ctags, +create or replace function swh_mktemp_content_ctags() + returns void + language sql +as $$ + create temporary table tmp_content_ctags ( + like content_ctags including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; + + +-- add tmp_content_ctags entries to content_ctags, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, +-- 2. call this function +create or replace function swh_content_ctags_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + delete from content_ctags + where id in (select tmp.id + from tmp_content_ctags tmp + inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); + end if; + + insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) + select id, name, kind, line, lang, indexer_configuration_id + from tmp_content_ctags tct + on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) + do nothing; + return; +end +$$; + +comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; + +-- create a temporary table for content_ctags missing routine +create or replace function swh_mktemp_content_ctags_missing() + returns void + language sql +as $$ + create temporary table tmp_content_ctags_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; + +-- check which entries of tmp_bytea are missing from content_ctags +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_ctags_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + (select id::sha1 from tmp_content_ctags_missing as tmp + where not exists + (select 1 from content_ctags as c + where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id + limit 1)); + return; +end +$$; + +comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; + +create type content_ctags_signature as ( + id sha1, + name text, + kind text, + line bigint, + lang ctags_languages, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content ctags from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_content_ctags_get() + returns setof content_ctags_signature + language plpgsql +as $$ +begin + return query + select c.id, c.name, c.kind, c.line, c.lang, + i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration + from tmp_bytea t + inner join content_ctags c using(id) + inner join indexer_configuration i on i.id = c.indexer_configuration_id + order by line; + return; +end +$$; + +comment on function swh_content_ctags_get() IS 'List content ctags'; + +-- Search within ctags content. +-- +create or replace function swh_content_ctags_search( + expression text, + l integer default 10, + last_sha1 sha1 default '\x0000000000000000000000000000000000000000') + returns setof content_ctags_signature + language sql +as $$ + select c.id, name, kind, line, lang, + i.id as tool_id, tool_name, tool_version, tool_configuration + from content_ctags c + inner join indexer_configuration i on i.id = c.indexer_configuration_id + where hash_sha1(name) = hash_sha1(expression) + and c.id > last_sha1 + order by id + limit l; +$$; + +comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; + + +-- create a temporary table for content_fossology_license tmp_content_fossology_license, +create or replace function swh_mktemp_content_fossology_license() + returns void + language sql +as $$ + create temporary table tmp_content_fossology_license ( + id sha1, + license text, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; + +-- add tmp_content_fossology_license entries to content_fossology_license, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to +-- tmp_content_fossology_license, 2. call this function +create or replace function swh_content_fossology_license_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + -- insert unknown licenses first + insert into fossology_license (name) + select distinct license from tmp_content_fossology_license tmp + where not exists (select 1 from fossology_license where name=tmp.license) + on conflict(name) do nothing; + + if conflict_update then + -- delete from content_fossology_license c + -- using tmp_content_fossology_license tmp, indexer_configuration i + -- where c.id = tmp.id and i.id=tmp.indexer_configuration_id + delete from content_fossology_license + where id in (select tmp.id + from tmp_content_fossology_license tmp + inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); + end if; + + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + on conflict(id, license_id, indexer_configuration_id) + do nothing; + return; +end +$$; + +comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; + +create type content_fossology_license_signature as ( + id sha1, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb, + licenses text[] +); + +-- Retrieve list of content license from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_fossology_license_get() + returns setof content_fossology_license_signature + language plpgsql +as $$ +begin + return query + select cl.id, + ic.id as tool_id, + ic.tool_name, + ic.tool_version, + ic.tool_configuration, + array(select name + from fossology_license + where id = ANY(array_agg(cl.license_id))) as licenses + from tmp_bytea tcl + inner join content_fossology_license cl using(id) + inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id + group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; + return; +end +$$; + +comment on function swh_content_fossology_license_get() IS 'List content licenses'; + +-- content_metadata functions +-- +-- create a temporary table for content_metadata tmp_content_metadata, +create or replace function swh_mktemp_content_metadata_missing() + returns void + language sql +as $$ + create temporary table tmp_content_metadata_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; + +-- check which entries of tmp_bytea are missing from content_metadata +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_metadata_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_content_metadata_missing as tmp + where not exists + (select 1 from content_metadata as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; + +-- add tmp_content_metadata entries to content_metadata, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_content_metadata, 2. call this function +create or replace function swh_content_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set translated_metadata = excluded.translated_metadata; + + else + insert into content_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; + +-- create a temporary table for retrieving content_metadata +create or replace function swh_mktemp_content_metadata() + returns void + language sql +as $$ + create temporary table tmp_content_metadata ( + like content_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; + +-- +create type content_metadata_signature as ( + id sha1, + translated_metadata jsonb, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content metadata from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_content_metadata_get() + returns setof content_metadata_signature + language plpgsql +as $$ +begin + return query + select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_metadata c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +comment on function swh_content_metadata_get() is 'List content''s metadata'; +-- end content_metadata functions + +-- revision_metadata functions +-- +-- create a temporary table for revision_metadata tmp_revision_metadata, +create or replace function swh_mktemp_revision_metadata_missing() + returns void + language sql +as $$ + create temporary table tmp_revision_metadata_missing ( + id sha1_git, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; + +-- check which entries of tmp_bytea are missing from revision_metadata +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_revision_metadata_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_revision_metadata_missing as tmp + where not exists + (select 1 from revision_metadata as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; + +-- add tmp_revision_metadata entries to revision_metadata, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_revision_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_revision_metadata, 2. call this function +create or replace function swh_revision_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into revision_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_revision_metadata tcm + on conflict(id, indexer_configuration_id) + do update set translated_metadata = excluded.translated_metadata; + + else + insert into revision_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_revision_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; + +-- create a temporary table for retrieving revision_metadata +create or replace function swh_mktemp_revision_metadata() + returns void + language sql +as $$ + create temporary table tmp_revision_metadata ( + like revision_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; + +-- +create type revision_metadata_signature as ( + id sha1_git, + translated_metadata jsonb, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of revision metadata from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_revision_metadata_get() + returns setof revision_metadata_signature + language plpgsql +as $$ +begin + return query + select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join revision_metadata c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +create or replace function swh_mktemp_indexer_configuration() + returns void + language sql +as $$ + create temporary table tmp_indexer_configuration ( + like indexer_configuration including defaults + ) on commit drop; + alter table tmp_indexer_configuration drop column id; +$$; + + +-- add tmp_indexer_configuration entries to indexer_configuration, +-- skipping duplicates if any. +-- +-- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to +-- it, 2. call this function to insert and filtering out duplicates +create or replace function swh_indexer_configuration_add() + returns setof indexer_configuration + language plpgsql +as $$ +begin + insert into indexer_configuration(tool_name, tool_version, tool_configuration) + select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + on conflict(tool_name, tool_version, tool_configuration) do nothing; + + return query + select id, tool_name, tool_version, tool_configuration + from tmp_indexer_configuration join indexer_configuration + using(tool_name, tool_version, tool_configuration); + + return; +end +$$; diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql new file mode 100644 index 0000000..addb720 --- /dev/null +++ b/sql/swh-indexes.sql @@ -0,0 +1,57 @@ +-- fossology_license +create unique index fossology_license_pkey on fossology_license(id); +alter table fossology_license add primary key using index fossology_license_pkey; + +create unique index on fossology_license(name); + +-- indexer_configuration +create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); +alter table indexer_configuration add primary key using index indexer_configuration_pkey; + +create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); + +-- content_ctags +create index on content_ctags(id); +create index on content_ctags(hash_sha1(name)); +create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); + +alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; + +-- content_metadata +create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); +alter table content_metadata add primary key using index content_metadata_pkey; + +alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; + +-- revision_metadata +create unique index revision_metadata_pkey on revision_metadata(id, indexer_configuration_id); +alter table revision_metadata add primary key using index revision_metadata_pkey; + +alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey; + +-- content_mimetype +create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); +alter table content_mimetype add primary key using index content_mimetype_pkey; + +alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; + +-- content_language +create unique index content_language_pkey on content_language(id, indexer_configuration_id); +alter table content_language add primary key using index content_language_pkey; + +alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_language validate constraint content_language_indexer_configuration_id_fkey; + +-- content_fossology_license +create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); +alter table content_fossology_license add primary key using index content_fossology_license_pkey; + +alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; + +alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; diff --git a/sql/swh-init.sql b/sql/swh-init.sql new file mode 100644 index 0000000..e78ac3c --- /dev/null +++ b/sql/swh-init.sql @@ -0,0 +1,13 @@ +create extension if not exists btree_gist; +create extension if not exists pgcrypto; + +create or replace language plpgsql; +create or replace language plpython3u; + +create or replace function hash_sha1(text) +returns text +as $$ +select encode(digest($1, 'sha1'), 'hex') +$$ language sql strict immutable; + +comment on function hash_sha1(text) is 'Compute sha1 hash as text'; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql new file mode 100644 index 0000000..b950793 --- /dev/null +++ b/sql/swh-schema.sql @@ -0,0 +1,138 @@ +--- +--- Software Heritage Indexers Data Model +--- + +-- drop schema if exists swh cascade; +-- create schema swh; +-- set search_path to swh; + +create table dbversion +( + version int primary key, + release timestamptz, + description text +); + +insert into dbversion(version, release, description) + values(114, now(), 'Work In Progress'); +-- Computing metadata on sha1's contents + +-- a SHA1 checksum (not necessarily originating from Git) +create domain sha1 as bytea check (length(value) = 20); + +-- a Git object ID, i.e., a SHA1 checksum +create domain sha1_git as bytea check (length(value) = 20); + +create table indexer_configuration ( + id serial not null, + tool_name text not null, + tool_version text not null, + tool_configuration jsonb +); + +comment on table indexer_configuration is 'Indexer''s configuration version'; +comment on column indexer_configuration.id is 'Tool identifier'; +comment on column indexer_configuration.tool_version is 'Tool name'; +comment on column indexer_configuration.tool_version is 'Tool version'; +comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; + +-- Properties (mimetype, encoding, etc...) +create table content_mimetype ( + id sha1 not null, + mimetype bytea not null, + encoding bytea not null, + indexer_configuration_id bigint not null +); + +comment on table content_mimetype is 'Metadata associated to a raw content'; +comment on column content_mimetype.mimetype is 'Raw content Mimetype'; +comment on column content_mimetype.encoding is 'Raw content encoding'; +comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; + +-- Language metadata +create table content_language ( + id sha1 not null, + lang languages not null, + indexer_configuration_id bigint not null +); + +comment on table content_language is 'Language information on a raw content'; +comment on column content_language.lang is 'Language information'; +comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; + +-- ctags information per content +create table content_ctags ( + id sha1 not null, + name text not null, + kind text not null, + line bigint not null, + lang ctags_languages not null, + indexer_configuration_id bigint not null +); + +comment on table content_ctags is 'Ctags information on a raw content'; +comment on column content_ctags.id is 'Content identifier'; +comment on column content_ctags.name is 'Symbol name'; +comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; +comment on column content_ctags.line is 'Symbol line'; +comment on column content_ctags.lang is 'Language information for that content'; +comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; + +create table fossology_license( + id smallserial, + name text not null +); + +comment on table fossology_license is 'Possible license recognized by license indexer'; +comment on column fossology_license.id is 'License identifier'; +comment on column fossology_license.name is 'License name'; + +create table content_fossology_license ( + id sha1 not null, + license_id smallserial not null, + indexer_configuration_id bigint not null +); + +comment on table content_fossology_license is 'license associated to a raw content'; +comment on column content_fossology_license.id is 'Raw content identifier'; +comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; +comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; + + +-- The table content_metadata provides a translation to files +-- identified as potentially containning metadata with a translation tool (indexer_configuration_id) +create table content_metadata( + id sha1 not null, + translated_metadata jsonb not null, + indexer_configuration_id bigint not null +); + +comment on table content_metadata is 'metadata semantically translated from a content file'; +comment on column content_metadata.id is 'sha1 of content file'; +comment on column content_metadata.translated_metadata is 'result of translation with defined format'; +comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; + +-- The table revision_metadata provides a minimal set of intrinsic metadata +-- detected with the detection tool (indexer_configuration_id) and aggregated +-- from the content_metadata translation. +create table revision_metadata( + id sha1_git not null, + translated_metadata jsonb not null, + indexer_configuration_id bigint not null +); + +comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; +comment on column revision_metadata.id is 'sha1_git of revision'; +comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; +comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; + +create table origin_metadata_translation( + id bigserial not null, -- PK origin_metadata identifier + result jsonb, + tool_id bigint +); + +comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry'; +comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; +comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; +comment on column origin_metadata_translation.tool_id is 'tool used for translation'; diff --git a/sql/upgrades/114.sql b/sql/upgrades/114.sql new file mode 100644 index 0000000..7699a6d --- /dev/null +++ b/sql/upgrades/114.sql @@ -0,0 +1,8 @@ +create sequence origin_metadata_translation_id_seq + start with 1 + increment by 1 + no maxvalue + no minvalue + cache 1; + +select setval('fossology_license_id_seq', 833, true);