diff --git a/.gitignore b/.gitignore index f5fc2ae..43c4b92 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ *.pyc *.sw? *~ .coverage .eggs/ __pycache__ *.egg-info/ -version.txt \ No newline at end of file +version.txt +/sql/createdb-stamp +/sql/filldb-stamp diff --git a/PKG-INFO b/PKG-INFO index 99942cf..2481b88 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.indexer -Version: 0.0.43 +Version: 0.0.44 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/rules b/debian/rules index 5383204..aa268db 100755 --- a/debian/rules +++ b/debian/rules @@ -1,11 +1,11 @@ #!/usr/bin/make -f export PYBUILD_NAME=swh.indexer -export PYBUILD_TEST_ARGS=--with-doctest -sv -a !db,!fs +export PYBUILD_TEST_ARGS=-sv -a !db,!fs %: dh $@ --with python3 --buildsystem=pybuild override_dh_install: dh_install rm -v $(CURDIR)/debian/python3-*/usr/lib/python*/dist-packages/swh/__init__.py diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..d52d181 --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,43 @@ +# Depends: postgresql-client, postgresql-autodoc + +DBNAME = softwareheritage-indexer-dev +DOCDIR = autodoc + +SQL_INIT = swh-init.sql +SQL_ENUMS = swh-enums.sql +SQL_SCHEMA = swh-schema.sql +SQL_FUNC = swh-func.sql +SQL_DATA = swh-data.sql +SQL_INDEX = swh-indexes.sql +SQLS = $(SQL_INIT) $(SQL_ENUMS) $(SQL_SCHEMA) $(SQL_FUNC) $(SQL_INDEX) $(SQL_DATA) + +PSQL_BIN = psql +PSQL_FLAGS = --echo-all -X -v ON_ERROR_STOP=1 +PSQL = $(PSQL_BIN) $(PSQL_FLAGS) + +all: + +createdb: createdb-stamp +createdb-stamp: $(SQL_INIT) + createdb $(DBNAME) + touch $@ + +filldb: filldb-stamp +filldb-stamp: createdb-stamp + cat $(SQLS) | $(PSQL) $(DBNAME) + touch $@ + +dropdb: + -dropdb $(DBNAME) + +dumpdb: swh-indexer.dump +swh-indexer.dump: filldb-stamp + pg_dump -Fc $(DBNAME) > $@ + +clean: + rm -rf *-stamp $(DOCDIR)/ + +distclean: clean dropdb + rm -f swh-indexer.dump + +.PHONY: all initdb createdb dropdb doc clean diff --git a/sql/bin/db-upgrade b/sql/bin/db-upgrade new file mode 100755 index 0000000..1dd4e2b --- /dev/null +++ b/sql/bin/db-upgrade @@ -0,0 +1,73 @@ +#!/bin/bash + +# Compute a draft upgrade script for the DB schema, based on Git revisions. + +# Depends: apgdiff + +set -e + +SQLS="swh-*.sql" +VERSION_SQL="swh-schema.sql" +UPGRADE_DIR="upgrades" +DB_NAME="softwareheritage-dev" + +usage () { + echo "Usage: db-upgrade GIT_REV_FROM [GIT_REV_TO]" + echo "Example: db-upgrade HEAD^" + echo " db-upgrade HEAD~4 HEAD~2" + echo "See also: gitrevisions(7)" + exit 1 +} + +pg_dump_revision () { + rev="$1" + dump="$2" + + echo "checking out revision $rev, and dumping DB at the time..." 
+ if [ "$rev" != "HEAD" ] ; then + git checkout --quiet "$rev" + fi + make distclean filldb > /dev/null + pg_dump "$DB_NAME" > "$dump" + if [ "$rev" != "HEAD" ] ; then + git checkout --quiet - + fi +} + +# argument parsing +if [ -z "$1" ] ; then + usage +fi +from_rev="$1" +shift 1 +if [ -z "$1" ] ; then + to_rev="HEAD" +else + to_rev="$1" + shift 1 +fi + +old_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) +new_dump=$(mktemp tmp.swh-db-upgrade.XXXXXXXXXX) +trap "rm -f $old_dump $new_dump" EXIT + +schema_version=$(grep -i -A 1 '^insert into dbversion' "$VERSION_SQL" | tail -n 1 \ + | sed -e 's/.*values(//i' -e 's/,.*//') +upgrade_script=$(mktemp -p "$UPGRADE_DIR" $(printf '%.03d' ${schema_version}).XXXX.sql) +pg_dump_revision "$from_rev" "$old_dump" +pg_dump_revision "$to_rev" "$new_dump" + +cat > "$upgrade_script" <> "$upgrade_script" + +echo "all done." +echo "Draft upgrade script is at: ${upgrade_script}" diff --git a/sql/bin/dot_add_content b/sql/bin/dot_add_content new file mode 100755 index 0000000..fc24e38 --- /dev/null +++ b/sql/bin/dot_add_content @@ -0,0 +1,15 @@ +#!/bin/bash + +DOT_FILE="$1" +DOT_EXTRA="$2" +if [ -z "$DOT_FILE" -o -z "$DOT_EXTRA" ] ; then + echo "Usage: $0 DOT_FILE DOT_EXTRA" + exit 1 +fi + +schema_version=$(grep -i -A 1 '^insert into dbversion' swh-schema.sql | tail -n 1 \ + | sed -e 's/.*values(//i' -e 's/,.*//') + +head -n -1 "$DOT_FILE" # all of $DOT_FILE but last line +sed "s/@@VERSION@@/$schema_version/" "$DOT_EXTRA" +echo "}" diff --git a/sql/json/.gitignore b/sql/json/.gitignore new file mode 100644 index 0000000..c337aa9 --- /dev/null +++ b/sql/json/.gitignore @@ -0,0 +1 @@ +*-stamp diff --git a/sql/json/Makefile b/sql/json/Makefile new file mode 100644 index 0000000..5d983b8 --- /dev/null +++ b/sql/json/Makefile @@ -0,0 +1,19 @@ +# Depends: json-glib-tools + +JSONVAL = json-glib-validate +JSONS = $(wildcard *.json) + +all: validate +check: validate +test: validate + +validate: validate-stamp +validate-stamp: $(JSONS) + make $(patsubst %,validate/%,$?) 
+ touch $@ + +validate/%: + $(JSONVAL) $* + +clean: + rm -f validate-stamp diff --git a/sql/json/indexer_configuration.tool_configuration.schema.json b/sql/json/indexer_configuration.tool_configuration.schema.json new file mode 100644 index 0000000..28396b4 --- /dev/null +++ b/sql/json/indexer_configuration.tool_configuration.schema.json @@ -0,0 +1,11 @@ +{ + "$schema": "http://json-schema.org/schema#", + "id": "http://softwareheritage.org/schemas/indexer_configuration.tool_configuration.schema.json", + + "type": "object", + "properties": { + "command_line": { + "type": "string" + } + } +} diff --git a/sql/json/revision_metadata.translated_metadata.json b/sql/json/revision_metadata.translated_metadata.json new file mode 100644 index 0000000..1806fc7 --- /dev/null +++ b/sql/json/revision_metadata.translated_metadata.json @@ -0,0 +1,59 @@ +{ + "$schema": "http://json-schema.org/schema#", + "id": "http://softwareheritage.org/schemas/revision_metadata.translated_metadata.schema.json", + + "type": "object", + "properties": { + "developmentStatus": { + "type": "list" + }, + "version": { + "type": "list" + }, + "operatingSystem": { + "type": "list" + }, + "description": { + "type": "list" + }, + "keywords": { + "type": "list" + }, + "issueTracker": { + "type": "list" + }, + "name": { + "type": "list" + }, + "author": { + "type": "list" + }, + "relatedLink": { + "type": "list" + }, + "url": { + "type": "list" + }, + "type": { + "type": "list" + }, + "license": { + "type": "list" + }, + "maintainer": { + "type": "list" + }, + "email": { + "type": "list" + }, + "softwareRequirements": { + "type": "list" + }, + "identifier": { + "type": "list" + }, + "codeRepository": { + "type": "list" + }, + } +} diff --git a/sql/swh-data.sql b/sql/swh-data.sql new file mode 100644 index 0000000..e429343 --- /dev/null +++ b/sql/swh-data.sql @@ -0,0 +1,26 @@ +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('nomos', '3.1.0rc2-31-ga2cbb8c', '{"command_line": "nomossa "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('file', '5.22', '{"command_line": "file --mime "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('universal-ctags', '~git7859817b', '{"command_line": "ctags --fields=+lnz --sort=no --links=no --output-format=json "}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('pygments', '2.0.1+dfsg-1.1+deb8u1', '{"type": "library", "debian-package": "python3-pygments", "max_content_size": 10240}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-metadata-translator', '0.0.1', '{"type": "local", "context": "npm"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-metadata-detector', '0.0.1', '{"type": "local", "context": ["npm", "codemeta"]}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('swh-deposit', '0.0.1', '{"sword_version": "2"}'); + +insert into indexer_configuration(tool_name, tool_version, tool_configuration) +values ('file', '1:5.30-1+deb9u1', '{"type": "library", "debian-package": "python3-magic"}'); diff --git a/sql/swh-enums.sql b/sql/swh-enums.sql new file mode 100644 index 0000000..a357eb5 --- 
/dev/null +++ b/sql/swh-enums.sql @@ -0,0 +1,100 @@ +create type languages as enum ( 'abap', 'abnf', 'actionscript', + 'actionscript-3', 'ada', 'adl', 'agda', 'alloy', 'ambienttalk', + 'antlr', 'antlr-with-actionscript-target', 'antlr-with-c#-target', + 'antlr-with-cpp-target', 'antlr-with-java-target', + 'antlr-with-objectivec-target', 'antlr-with-perl-target', + 'antlr-with-python-target', 'antlr-with-ruby-target', 'apacheconf', + 'apl', 'applescript', 'arduino', 'aspectj', 'aspx-cs', 'aspx-vb', + 'asymptote', 'autohotkey', 'autoit', 'awk', 'base-makefile', 'bash', + 'bash-session', 'batchfile', 'bbcode', 'bc', 'befunge', + 'blitzbasic', 'blitzmax', 'bnf', 'boo', 'boogie', 'brainfuck', + 'bro', 'bugs', 'c', 'c#', 'c++', 'c-objdump', 'ca65-assembler', + 'cadl', 'camkes', 'cbm-basic-v2', 'ceylon', 'cfengine3', + 'cfstatement', 'chaiscript', 'chapel', 'cheetah', 'cirru', 'clay', + 'clojure', 'clojurescript', 'cmake', 'cobol', 'cobolfree', + 'coffeescript', 'coldfusion-cfc', 'coldfusion-html', 'common-lisp', + 'component-pascal', 'coq', 'cpp-objdump', 'cpsa', 'crmsh', 'croc', + 'cryptol', 'csound-document', 'csound-orchestra', 'csound-score', + 'css', 'css+django/jinja', 'css+genshi-text', 'css+lasso', + 'css+mako', 'css+mozpreproc', 'css+myghty', 'css+php', 'css+ruby', + 'css+smarty', 'cuda', 'cypher', 'cython', 'd', 'd-objdump', + 'darcs-patch', 'dart', 'debian-control-file', 'debian-sourcelist', + 'delphi', 'dg', 'diff', 'django/jinja', 'docker', 'dtd', 'duel', + 'dylan', 'dylan-session', 'dylanlid', 'earl-grey', 'easytrieve', + 'ebnf', 'ec', 'ecl', 'eiffel', 'elixir', 'elixir-iex-session', + 'elm', 'emacslisp', 'embedded-ragel', 'erb', 'erlang', + 'erlang-erl-session', 'evoque', 'ezhil', 'factor', 'fancy', + 'fantom', 'felix', 'fish', 'fortran', 'fortranfixed', 'foxpro', + 'fsharp', 'gap', 'gas', 'genshi', 'genshi-text', 'gettext-catalog', + 'gherkin', 'glsl', 'gnuplot', 'go', 'golo', 'gooddata-cl', 'gosu', + 'gosu-template', 'groff', 'groovy', 'haml', 'handlebars', 'haskell', + 'haxe', 'hexdump', 'html', 'html+cheetah', 'html+django/jinja', + 'html+evoque', 'html+genshi', 'html+handlebars', 'html+lasso', + 'html+mako', 'html+myghty', 'html+php', 'html+smarty', 'html+twig', + 'html+velocity', 'http', 'hxml', 'hy', 'hybris', 'idl', 'idris', + 'igor', 'inform-6', 'inform-6-template', 'inform-7', 'ini', 'io', + 'ioke', 'irc-logs', 'isabelle', 'j', 'jade', 'jags', 'jasmin', + 'java', 'java-server-page', 'javascript', 'javascript+cheetah', + 'javascript+django/jinja', 'javascript+genshi-text', + 'javascript+lasso', 'javascript+mako', 'javascript+mozpreproc', + 'javascript+myghty', 'javascript+php', 'javascript+ruby', + 'javascript+smarty', 'jcl', 'json', 'json-ld', 'julia', + 'julia-console', 'kal', 'kconfig', 'koka', 'kotlin', 'lasso', + 'lean', 'lesscss', 'lighttpd-configuration-file', 'limbo', 'liquid', + 'literate-agda', 'literate-cryptol', 'literate-haskell', + 'literate-idris', 'livescript', 'llvm', 'logos', 'logtalk', 'lsl', + 'lua', 'makefile', 'mako', 'maql', 'mask', 'mason', 'mathematica', + 'matlab', 'matlab-session', 'minid', 'modelica', 'modula-2', + 'moinmoin/trac-wiki-markup', 'monkey', 'moocode', 'moonscript', + 'mozhashpreproc', 'mozpercentpreproc', 'mql', 'mscgen', + 'msdos-session', 'mupad', 'mxml', 'myghty', 'mysql', 'nasm', + 'nemerle', 'nesc', 'newlisp', 'newspeak', + 'nginx-configuration-file', 'nimrod', 'nit', 'nix', 'nsis', 'numpy', + 'objdump', 'objdump-nasm', 'objective-c', 'objective-c++', + 'objective-j', 'ocaml', 'octave', 'odin', 'ooc', 'opa', + 'openedge-abl', 
'pacmanconf', 'pan', 'parasail', 'pawn', 'perl', + 'perl6', 'php', 'pig', 'pike', 'pkgconfig', 'pl/pgsql', + 'postgresql-console-(psql)', 'postgresql-sql-dialect', 'postscript', + 'povray', 'powershell', 'powershell-session', 'praat', 'prolog', + 'properties', 'protocol-buffer', 'puppet', 'pypy-log', 'python', + 'python-3', 'python-3.0-traceback', 'python-console-session', + 'python-traceback', 'qbasic', 'qml', 'qvto', 'racket', 'ragel', + 'ragel-in-c-host', 'ragel-in-cpp-host', 'ragel-in-d-host', + 'ragel-in-java-host', 'ragel-in-objective-c-host', + 'ragel-in-ruby-host', 'raw-token-data', 'rconsole', 'rd', 'rebol', + 'red', 'redcode', 'reg', 'resourcebundle', 'restructuredtext', + 'rexx', 'rhtml', 'roboconf-graph', 'roboconf-instances', + 'robotframework', 'rpmspec', 'rql', 'rsl', 'ruby', + 'ruby-irb-session', 'rust', 's', 'sass', 'scala', + 'scalate-server-page', 'scaml', 'scheme', 'scilab', 'scss', 'shen', + 'slim', 'smali', 'smalltalk', 'smarty', 'snobol', 'sourcepawn', + 'sparql', 'sql', 'sqlite3con', 'squidconf', 'stan', 'standard-ml', + 'supercollider', 'swift', 'swig', 'systemverilog', 'tads-3', 'tap', + 'tcl', 'tcsh', 'tcsh-session', 'tea', 'termcap', 'terminfo', + 'terraform', 'tex', 'text-only', 'thrift', 'todotxt', + 'trafficscript', 'treetop', 'turtle', 'twig', 'typescript', + 'urbiscript', 'vala', 'vb.net', 'vctreestatus', 'velocity', + 'verilog', 'vgl', 'vhdl', 'viml', 'x10', 'xml', 'xml+cheetah', + 'xml+django/jinja', 'xml+evoque', 'xml+lasso', 'xml+mako', + 'xml+myghty', 'xml+php', 'xml+ruby', 'xml+smarty', 'xml+velocity', + 'xquery', 'xslt', 'xtend', 'xul+mozpreproc', 'yaml', 'yaml+jinja', + 'zephir', 'unknown' +); +comment on type languages is 'Languages recognized by language indexer'; + +create type ctags_languages as enum ( 'Ada', 'AnsiblePlaybook', 'Ant', + 'Asm', 'Asp', 'Autoconf', 'Automake', 'Awk', 'Basic', 'BETA', 'C', + 'C#', 'C++', 'Clojure', 'Cobol', 'CoffeeScript [disabled]', 'CSS', + 'ctags', 'D', 'DBusIntrospect', 'Diff', 'DosBatch', 'DTS', 'Eiffel', + 'Erlang', 'Falcon', 'Flex', 'Fortran', 'gdbinit [disabled]', + 'Glade', 'Go', 'HTML', 'Iniconf', 'Java', 'JavaProperties', + 'JavaScript', 'JSON', 'Lisp', 'Lua', 'M4', 'Make', 'man [disabled]', + 'MatLab', 'Maven2', 'Myrddin', 'ObjectiveC', 'OCaml', 'OldC + [disabled]', 'OldC++ [disabled]', 'Pascal', 'Perl', 'Perl6', 'PHP', + 'PlistXML', 'pod', 'Protobuf', 'Python', 'PythonLoggingConfig', 'R', + 'RelaxNG', 'reStructuredText', 'REXX', 'RpmSpec', 'Ruby', 'Rust', + 'Scheme', 'Sh', 'SLang', 'SML', 'SQL', 'SVG', 'SystemdUnit', + 'SystemVerilog', 'Tcl', 'Tex', 'TTCN', 'Vera', 'Verilog', 'VHDL', + 'Vim', 'WindRes', 'XSLT', 'YACC', 'Yaml', 'YumRepo', 'Zephir' +); +comment on type ctags_languages is 'Languages recognized by ctags indexer'; diff --git a/sql/swh-func.sql b/sql/swh-func.sql new file mode 100644 index 0000000..62df8fa --- /dev/null +++ b/sql/swh-func.sql @@ -0,0 +1,721 @@ +-- create a temporary table with a single "bytea" column for fast object lookup. 
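The stored procedures added below in swh-func.sql all follow the bulk pattern spelled out in their comments: 0. create a temporary table, 1. COPY the inputs into it, 2. call the function. A minimal client-side sketch of that round trip with psycopg2 follows; the connection string and sha1 are illustrative, and a plain INSERT stands in for the COPY the real callers use:

import psycopg2

# Illustrative 20-byte content identifier.
sha1 = bytes.fromhex('34973274ccef6ab4dfaaf86599792fa9c3fe4689')

with psycopg2.connect('service=swh-indexer-dev') as conn:
    with conn.cursor() as cur:
        # 0. temporary lookup table, dropped at commit
        cur.execute('select swh_mktemp_bytea()')
        # 1. fill it (production code COPYs many ids at once)
        cur.execute('insert into tmp_bytea (id) values (%s)', (sha1,))
        # 2. call the set-returning function that joins on the temporary table
        cur.execute('select * from swh_content_mimetype_get()')
        print(cur.fetchall())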
+create or replace function swh_mktemp_bytea() + returns void + language sql +as $$ + create temporary table tmp_bytea ( + id bytea + ) on commit drop; +$$; + +-- create a temporary table called tmp_TBLNAME, mimicking existing table +-- TBLNAME +-- +-- Args: +-- tblname: name of the table to mimick +create or replace function swh_mktemp(tblname regclass) + returns void + language plpgsql +as $$ +begin + execute format(' + create temporary table tmp_%1$I + (like %1$I including defaults) + on commit drop; + alter table tmp_%1$I drop column if exists object_id; + ', tblname); + return; +end +$$; + +-- create a temporary table for content_ctags tmp_content_mimetype_missing, +create or replace function swh_mktemp_content_mimetype_missing() + returns void + language sql +as $$ + create temporary table tmp_content_mimetype_missing ( + id sha1, + indexer_configuration_id bigint + ) on commit drop; +$$; + +comment on function swh_mktemp_content_mimetype_missing() IS 'Helper table to filter existing mimetype information'; + +-- check which entries of tmp_bytea are missing from content_mimetype +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_mimetype_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + (select id::sha1 from tmp_content_mimetype_missing as tmp + where not exists + (select 1 from content_mimetype as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id)); + return; +end +$$; + +comment on function swh_content_mimetype_missing() is 'Filter existing mimetype information'; + +-- create a temporary table for content_mimetype tmp_content_mimetype, +create or replace function swh_mktemp_content_mimetype() + returns void + language sql +as $$ + create temporary table tmp_content_mimetype ( + like content_mimetype including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_mimetype() IS 'Helper table to add mimetype information'; + +-- add tmp_content_mimetype entries to content_mimetype, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_mimetype_missing must take place before calling this +-- function. +-- +-- +-- operates in bulk: 0. swh_mktemp(content_mimetype), 1. COPY to tmp_content_mimetype, +-- 2. call this function +create or replace function swh_content_mimetype_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) + do update set mimetype = excluded.mimetype, + encoding = excluded.encoding; + + else + insert into content_mimetype (id, mimetype, encoding, indexer_configuration_id) + select id, mimetype, encoding, indexer_configuration_id + from tmp_content_mimetype tcm + on conflict(id, indexer_configuration_id) do nothing; + end if; + return; +end +$$; + +comment on function swh_content_mimetype_add(boolean) IS 'Add new content mimetypes'; + +create type content_mimetype_signature as( + id sha1, + mimetype bytea, + encoding bytea, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content mimetype from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. 
COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_mimetype_get() + returns setof content_mimetype_signature + language plpgsql +as $$ +begin + return query + select c.id, mimetype, encoding, + i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_mimetype c on c.id=t.id + inner join indexer_configuration i on c.indexer_configuration_id=i.id; + return; +end +$$; + +comment on function swh_content_mimetype_get() IS 'List content''s mimetypes'; + +-- create a temporary table for content_language tmp_content_language, +create or replace function swh_mktemp_content_language_missing() + returns void + language sql +as $$ + create temporary table tmp_content_language_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_language_missing() is 'Helper table to filter missing language'; + +-- check which entries of tmp_bytea are missing from content_language +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_language_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_content_language_missing as tmp + where not exists + (select 1 from content_language as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_content_language_missing() IS 'Filter missing content languages'; + +-- add tmp_content_language entries to content_language, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_language_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_content_language, 2. call this function +create or replace function swh_content_language_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do update set lang = excluded.lang; + + else + insert into content_language (id, lang, indexer_configuration_id) + select id, lang, indexer_configuration_id + from tmp_content_language tcl + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_content_language_add(boolean) IS 'Add new content languages'; + +-- create a temporary table for retrieving content_language +create or replace function swh_mktemp_content_language() + returns void + language sql +as $$ + create temporary table tmp_content_language ( + like content_language including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_language() is 'Helper table to add content language'; + +create type content_language_signature as ( + id sha1, + lang languages, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content language from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. 
call this function +create or replace function swh_content_language_get() + returns setof content_language_signature + language plpgsql +as $$ +begin + return query + select c.id, lang, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_language c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +comment on function swh_content_language_get() is 'List content''s language'; + + +-- create a temporary table for content_ctags tmp_content_ctags, +create or replace function swh_mktemp_content_ctags() + returns void + language sql +as $$ + create temporary table tmp_content_ctags ( + like content_ctags including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_ctags() is 'Helper table to add content ctags'; + + +-- add tmp_content_ctags entries to content_ctags, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- operates in bulk: 0. swh_mktemp(content_ctags), 1. COPY to tmp_content_ctags, +-- 2. call this function +create or replace function swh_content_ctags_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + delete from content_ctags + where id in (select tmp.id + from tmp_content_ctags tmp + inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); + end if; + + insert into content_ctags (id, name, kind, line, lang, indexer_configuration_id) + select id, name, kind, line, lang, indexer_configuration_id + from tmp_content_ctags tct + on conflict(id, hash_sha1(name), kind, line, lang, indexer_configuration_id) + do nothing; + return; +end +$$; + +comment on function swh_content_ctags_add(boolean) IS 'Add new ctags symbols per content'; + +-- create a temporary table for content_ctags missing routine +create or replace function swh_mktemp_content_ctags_missing() + returns void + language sql +as $$ + create temporary table tmp_content_ctags_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_ctags_missing() is 'Helper table to filter missing content ctags'; + +-- check which entries of tmp_bytea are missing from content_ctags +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_ctags_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + (select id::sha1 from tmp_content_ctags_missing as tmp + where not exists + (select 1 from content_ctags as c + where c.id = tmp.id and c.indexer_configuration_id=tmp.indexer_configuration_id + limit 1)); + return; +end +$$; + +comment on function swh_content_ctags_missing() IS 'Filter missing content ctags'; + +create type content_ctags_signature as ( + id sha1, + name text, + kind text, + line bigint, + lang ctags_languages, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content ctags from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. 
call this function +create or replace function swh_content_ctags_get() + returns setof content_ctags_signature + language plpgsql +as $$ +begin + return query + select c.id, c.name, c.kind, c.line, c.lang, + i.id as tool_id, i.tool_name, i.tool_version, i.tool_configuration + from tmp_bytea t + inner join content_ctags c using(id) + inner join indexer_configuration i on i.id = c.indexer_configuration_id + order by line; + return; +end +$$; + +comment on function swh_content_ctags_get() IS 'List content ctags'; + +-- Search within ctags content. +-- +create or replace function swh_content_ctags_search( + expression text, + l integer default 10, + last_sha1 sha1 default '\x0000000000000000000000000000000000000000') + returns setof content_ctags_signature + language sql +as $$ + select c.id, name, kind, line, lang, + i.id as tool_id, tool_name, tool_version, tool_configuration + from content_ctags c + inner join indexer_configuration i on i.id = c.indexer_configuration_id + where hash_sha1(name) = hash_sha1(expression) + and c.id > last_sha1 + order by id + limit l; +$$; + +comment on function swh_content_ctags_search(text, integer, sha1) IS 'Equality search through ctags'' symbols'; + + +-- create a temporary table for content_fossology_license tmp_content_fossology_license, +create or replace function swh_mktemp_content_fossology_license() + returns void + language sql +as $$ + create temporary table tmp_content_fossology_license ( + id sha1, + license text, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_fossology_license() is 'Helper table to add content license'; + +-- add tmp_content_fossology_license entries to content_fossology_license, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- operates in bulk: 0. swh_mktemp(content_fossology_license), 1. COPY to +-- tmp_content_fossology_license, 2. call this function +create or replace function swh_content_fossology_license_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + -- insert unknown licenses first + insert into fossology_license (name) + select distinct license from tmp_content_fossology_license tmp + where not exists (select 1 from fossology_license where name=tmp.license) + on conflict(name) do nothing; + + if conflict_update then + -- delete from content_fossology_license c + -- using tmp_content_fossology_license tmp, indexer_configuration i + -- where c.id = tmp.id and i.id=tmp.indexer_configuration_id + delete from content_fossology_license + where id in (select tmp.id + from tmp_content_fossology_license tmp + inner join indexer_configuration i on i.id=tmp.indexer_configuration_id); + end if; + + insert into content_fossology_license (id, license_id, indexer_configuration_id) + select tcl.id, + (select id from fossology_license where name = tcl.license) as license, + indexer_configuration_id + from tmp_content_fossology_license tcl + on conflict(id, license_id, indexer_configuration_id) + do nothing; + return; +end +$$; + +comment on function swh_content_fossology_license_add(boolean) IS 'Add new content licenses'; + +create type content_fossology_license_signature as ( + id sha1, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb, + licenses text[] +); + +-- Retrieve list of content license from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, +-- 2. 
call this function +create or replace function swh_content_fossology_license_get() + returns setof content_fossology_license_signature + language plpgsql +as $$ +begin + return query + select cl.id, + ic.id as tool_id, + ic.tool_name, + ic.tool_version, + ic.tool_configuration, + array(select name + from fossology_license + where id = ANY(array_agg(cl.license_id))) as licenses + from tmp_bytea tcl + inner join content_fossology_license cl using(id) + inner join indexer_configuration ic on ic.id=cl.indexer_configuration_id + group by cl.id, ic.id, ic.tool_name, ic.tool_version, ic.tool_configuration; + return; +end +$$; + +comment on function swh_content_fossology_license_get() IS 'List content licenses'; + +-- content_metadata functions +-- +-- create a temporary table for content_metadata tmp_content_metadata, +create or replace function swh_mktemp_content_metadata_missing() + returns void + language sql +as $$ + create temporary table tmp_content_metadata_missing ( + id sha1, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata_missing() is 'Helper table to filter missing metadata in content_metadata'; + +-- check which entries of tmp_bytea are missing from content_metadata +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_content_metadata_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_content_metadata_missing as tmp + where not exists + (select 1 from content_metadata as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_content_metadata_missing() IS 'Filter missing content metadata'; + +-- add tmp_content_metadata entries to content_metadata, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_content_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_content_metadata, 2. 
call this function +create or replace function swh_content_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into content_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do update set translated_metadata = excluded.translated_metadata; + + else + insert into content_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_content_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_content_metadata_add(boolean) IS 'Add new content metadata'; + +-- create a temporary table for retrieving content_metadata +create or replace function swh_mktemp_content_metadata() + returns void + language sql +as $$ + create temporary table tmp_content_metadata ( + like content_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_content_metadata() is 'Helper table to add content metadata'; + +-- +create type content_metadata_signature as ( + id sha1, + translated_metadata jsonb, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of content metadata from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_content_metadata_get() + returns setof content_metadata_signature + language plpgsql +as $$ +begin + return query + select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join content_metadata c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +comment on function swh_content_metadata_get() is 'List content''s metadata'; +-- end content_metadata functions + +-- revision_metadata functions +-- +-- create a temporary table for revision_metadata tmp_revision_metadata, +create or replace function swh_mktemp_revision_metadata_missing() + returns void + language sql +as $$ + create temporary table tmp_revision_metadata_missing ( + id sha1_git, + indexer_configuration_id integer + ) on commit drop; +$$; + +comment on function swh_mktemp_revision_metadata_missing() is 'Helper table to filter missing metadata in revision_metadata'; + +-- check which entries of tmp_bytea are missing from revision_metadata +-- +-- operates in bulk: 0. swh_mktemp_bytea(), 1. COPY to tmp_bytea, +-- 2. call this function +create or replace function swh_revision_metadata_missing() + returns setof sha1 + language plpgsql +as $$ +begin + return query + select id::sha1 from tmp_revision_metadata_missing as tmp + where not exists + (select 1 from revision_metadata as c + where c.id = tmp.id and c.indexer_configuration_id = tmp.indexer_configuration_id); + return; +end +$$; + +comment on function swh_revision_metadata_missing() IS 'Filter missing content metadata'; + +-- add tmp_revision_metadata entries to revision_metadata, overwriting +-- duplicates if conflict_update is true, skipping duplicates otherwise. +-- +-- If filtering duplicates is in order, the call to +-- swh_revision_metadata_missing must take place before calling this +-- function. +-- +-- operates in bulk: 0. swh_mktemp(content_language), 1. COPY to +-- tmp_revision_metadata, 2. 
call this function +create or replace function swh_revision_metadata_add(conflict_update boolean) + returns void + language plpgsql +as $$ +begin + if conflict_update then + insert into revision_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_revision_metadata tcm + on conflict(id, indexer_configuration_id) + do update set translated_metadata = excluded.translated_metadata; + + else + insert into revision_metadata (id, translated_metadata, indexer_configuration_id) + select id, translated_metadata, indexer_configuration_id + from tmp_revision_metadata tcm + on conflict(id, indexer_configuration_id) + do nothing; + end if; + return; +end +$$; + +comment on function swh_revision_metadata_add(boolean) IS 'Add new revision metadata'; + +-- create a temporary table for retrieving revision_metadata +create or replace function swh_mktemp_revision_metadata() + returns void + language sql +as $$ + create temporary table tmp_revision_metadata ( + like revision_metadata including defaults + ) on commit drop; +$$; + +comment on function swh_mktemp_revision_metadata() is 'Helper table to add revision metadata'; + +-- +create type revision_metadata_signature as ( + id sha1_git, + translated_metadata jsonb, + tool_id integer, + tool_name text, + tool_version text, + tool_configuration jsonb +); + +-- Retrieve list of revision metadata from the temporary table. +-- +-- operates in bulk: 0. mktemp(tmp_bytea), 1. COPY to tmp_bytea, 2. call this function +create or replace function swh_revision_metadata_get() + returns setof revision_metadata_signature + language plpgsql +as $$ +begin + return query + select c.id, translated_metadata, i.id as tool_id, tool_name, tool_version, tool_configuration + from tmp_bytea t + inner join revision_metadata c on c.id = t.id + inner join indexer_configuration i on i.id=c.indexer_configuration_id; + return; +end +$$; + +create or replace function swh_mktemp_indexer_configuration() + returns void + language sql +as $$ + create temporary table tmp_indexer_configuration ( + like indexer_configuration including defaults + ) on commit drop; + alter table tmp_indexer_configuration drop column id; +$$; + + +-- add tmp_indexer_configuration entries to indexer_configuration, +-- skipping duplicates if any. +-- +-- operates in bulk: 0. create temporary tmp_indexer_configuration, 1. COPY to +-- it, 2. 
call this function to insert and filtering out duplicates +create or replace function swh_indexer_configuration_add() + returns setof indexer_configuration + language plpgsql +as $$ +begin + insert into indexer_configuration(tool_name, tool_version, tool_configuration) + select tool_name, tool_version, tool_configuration from tmp_indexer_configuration tmp + on conflict(tool_name, tool_version, tool_configuration) do nothing; + + return query + select id, tool_name, tool_version, tool_configuration + from tmp_indexer_configuration join indexer_configuration + using(tool_name, tool_version, tool_configuration); + + return; +end +$$; diff --git a/sql/swh-indexes.sql b/sql/swh-indexes.sql new file mode 100644 index 0000000..addb720 --- /dev/null +++ b/sql/swh-indexes.sql @@ -0,0 +1,57 @@ +-- fossology_license +create unique index fossology_license_pkey on fossology_license(id); +alter table fossology_license add primary key using index fossology_license_pkey; + +create unique index on fossology_license(name); + +-- indexer_configuration +create unique index concurrently indexer_configuration_pkey on indexer_configuration(id); +alter table indexer_configuration add primary key using index indexer_configuration_pkey; + +create unique index on indexer_configuration(tool_name, tool_version, tool_configuration); + +-- content_ctags +create index on content_ctags(id); +create index on content_ctags(hash_sha1(name)); +create unique index on content_ctags(id, hash_sha1(name), kind, line, lang, indexer_configuration_id); + +alter table content_ctags add constraint content_ctags_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_ctags validate constraint content_ctags_indexer_configuration_id_fkey; + +-- content_metadata +create unique index content_metadata_pkey on content_metadata(id, indexer_configuration_id); +alter table content_metadata add primary key using index content_metadata_pkey; + +alter table content_metadata add constraint content_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_metadata validate constraint content_metadata_indexer_configuration_id_fkey; + +-- revision_metadata +create unique index revision_metadata_pkey on revision_metadata(id, indexer_configuration_id); +alter table revision_metadata add primary key using index revision_metadata_pkey; + +alter table revision_metadata add constraint revision_metadata_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table revision_metadata validate constraint revision_metadata_indexer_configuration_id_fkey; + +-- content_mimetype +create unique index content_mimetype_pkey on content_mimetype(id, indexer_configuration_id); +alter table content_mimetype add primary key using index content_mimetype_pkey; + +alter table content_mimetype add constraint content_mimetype_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_mimetype validate constraint content_mimetype_indexer_configuration_id_fkey; + +-- content_language +create unique index content_language_pkey on content_language(id, indexer_configuration_id); +alter table content_language add primary key using index content_language_pkey; + +alter table content_language add constraint content_language_indexer_configuration_id_fkey foreign key 
(indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_language validate constraint content_language_indexer_configuration_id_fkey; + +-- content_fossology_license +create unique index content_fossology_license_pkey on content_fossology_license(id, license_id, indexer_configuration_id); +alter table content_fossology_license add primary key using index content_fossology_license_pkey; + +alter table content_fossology_license add constraint content_fossology_license_license_id_fkey foreign key (license_id) references fossology_license(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_license_id_fkey; + +alter table content_fossology_license add constraint content_fossology_license_indexer_configuration_id_fkey foreign key (indexer_configuration_id) references indexer_configuration(id) not valid; +alter table content_fossology_license validate constraint content_fossology_license_indexer_configuration_id_fkey; diff --git a/sql/swh-init.sql b/sql/swh-init.sql new file mode 100644 index 0000000..e78ac3c --- /dev/null +++ b/sql/swh-init.sql @@ -0,0 +1,13 @@ +create extension if not exists btree_gist; +create extension if not exists pgcrypto; + +create or replace language plpgsql; +create or replace language plpython3u; + +create or replace function hash_sha1(text) +returns text +as $$ +select encode(digest($1, 'sha1'), 'hex') +$$ language sql strict immutable; + +comment on function hash_sha1(text) is 'Compute sha1 hash as text'; diff --git a/sql/swh-schema.sql b/sql/swh-schema.sql new file mode 100644 index 0000000..b950793 --- /dev/null +++ b/sql/swh-schema.sql @@ -0,0 +1,138 @@ +--- +--- Software Heritage Indexers Data Model +--- + +-- drop schema if exists swh cascade; +-- create schema swh; +-- set search_path to swh; + +create table dbversion +( + version int primary key, + release timestamptz, + description text +); + +insert into dbversion(version, release, description) + values(114, now(), 'Work In Progress'); +-- Computing metadata on sha1's contents + +-- a SHA1 checksum (not necessarily originating from Git) +create domain sha1 as bytea check (length(value) = 20); + +-- a Git object ID, i.e., a SHA1 checksum +create domain sha1_git as bytea check (length(value) = 20); + +create table indexer_configuration ( + id serial not null, + tool_name text not null, + tool_version text not null, + tool_configuration jsonb +); + +comment on table indexer_configuration is 'Indexer''s configuration version'; +comment on column indexer_configuration.id is 'Tool identifier'; +comment on column indexer_configuration.tool_name is 'Tool name'; +comment on column indexer_configuration.tool_version is 'Tool version'; +comment on column indexer_configuration.tool_configuration is 'Tool configuration: command line, flags, etc...'; + +-- Properties (mimetype, encoding, etc...)
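For orientation, a sketch of how an indexer_configuration row and an entry of the content_mimetype table defined next look from the Python side; the tool values mirror sql/swh-data.sql, while the id, sha1, mimetype and encoding are invented:

# Tool dict as registered by the indexers (cf. register_tools/_prepare_tool in
# swh/indexer/indexer.py later in this diff); 'id' is the hypothetical
# indexer_configuration.id handed back by the database.
tool = {
    'id': 2,
    'tool_name': 'file',
    'tool_version': '5.22',
    'tool_configuration': {'command_line': 'file --mime '},
}

# Matching content_mimetype entry produced by the mimetype indexer.
content_mimetype_entry = {
    'id': bytes.fromhex('34973274ccef6ab4dfaaf86599792fa9c3fe4689'),
    'mimetype': b'text/plain',
    'encoding': b'us-ascii',
    'indexer_configuration_id': tool['id'],
}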
+create table content_mimetype ( + id sha1 not null, + mimetype bytea not null, + encoding bytea not null, + indexer_configuration_id bigint not null +); + +comment on table content_mimetype is 'Metadata associated to a raw content'; +comment on column content_mimetype.mimetype is 'Raw content Mimetype'; +comment on column content_mimetype.encoding is 'Raw content encoding'; +comment on column content_mimetype.indexer_configuration_id is 'Tool used to compute the information'; + +-- Language metadata +create table content_language ( + id sha1 not null, + lang languages not null, + indexer_configuration_id bigint not null +); + +comment on table content_language is 'Language information on a raw content'; +comment on column content_language.lang is 'Language information'; +comment on column content_language.indexer_configuration_id is 'Tool used to compute the information'; + +-- ctags information per content +create table content_ctags ( + id sha1 not null, + name text not null, + kind text not null, + line bigint not null, + lang ctags_languages not null, + indexer_configuration_id bigint not null +); + +comment on table content_ctags is 'Ctags information on a raw content'; +comment on column content_ctags.id is 'Content identifier'; +comment on column content_ctags.name is 'Symbol name'; +comment on column content_ctags.kind is 'Symbol kind (function, class, variable, const...)'; +comment on column content_ctags.line is 'Symbol line'; +comment on column content_ctags.lang is 'Language information for that content'; +comment on column content_ctags.indexer_configuration_id is 'Tool used to compute the information'; + +create table fossology_license( + id smallserial, + name text not null +); + +comment on table fossology_license is 'Possible license recognized by license indexer'; +comment on column fossology_license.id is 'License identifier'; +comment on column fossology_license.name is 'License name'; + +create table content_fossology_license ( + id sha1 not null, + license_id smallserial not null, + indexer_configuration_id bigint not null +); + +comment on table content_fossology_license is 'license associated to a raw content'; +comment on column content_fossology_license.id is 'Raw content identifier'; +comment on column content_fossology_license.license_id is 'One of the content''s license identifier'; +comment on column content_fossology_license.indexer_configuration_id is 'Tool used to compute the information'; + + +-- The table content_metadata provides a translation to files +-- identified as potentially containning metadata with a translation tool (indexer_configuration_id) +create table content_metadata( + id sha1 not null, + translated_metadata jsonb not null, + indexer_configuration_id bigint not null +); + +comment on table content_metadata is 'metadata semantically translated from a content file'; +comment on column content_metadata.id is 'sha1 of content file'; +comment on column content_metadata.translated_metadata is 'result of translation with defined format'; +comment on column content_metadata.indexer_configuration_id is 'tool used for translation'; + +-- The table revision_metadata provides a minimal set of intrinsic metadata +-- detected with the detection tool (indexer_configuration_id) and aggregated +-- from the content_metadata translation. 
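Before the revision_metadata DDL itself, a sketch of what its translated_metadata column holds, using the property names declared in sql/json/revision_metadata.translated_metadata.json earlier in this diff; every property is list-valued there, and the values below are made up:

translated_metadata = {
    'name': ['example-package'],
    'version': ['0.0.1'],
    'description': ['An illustrative package description'],
    'author': ['Jane Doe'],
    'license': ['MIT'],
    'codeRepository': ['https://example.org/example-package.git'],
    'developmentStatus': ['stable'],
}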
+create table revision_metadata( + id sha1_git not null, + translated_metadata jsonb not null, + indexer_configuration_id bigint not null +); + +comment on table revision_metadata is 'metadata semantically detected and translated in a revision'; +comment on column revision_metadata.id is 'sha1_git of revision'; +comment on column revision_metadata.translated_metadata is 'result of detection and translation with defined format'; +comment on column revision_metadata.indexer_configuration_id is 'tool used for detection'; + +create table origin_metadata_translation( + id bigserial not null, -- PK origin_metadata identifier + result jsonb, + tool_id bigint +); + +comment on table origin_metadata_translation is 'keeps translated for an origin_metadata entry'; +comment on column origin_metadata_translation.id is 'the entry id in origin_metadata'; +comment on column origin_metadata_translation.result is 'translated_metadata result after translation with tool'; +comment on column origin_metadata_translation.tool_id is 'tool used for translation'; diff --git a/sql/upgrades/114.sql b/sql/upgrades/114.sql new file mode 100644 index 0000000..7699a6d --- /dev/null +++ b/sql/upgrades/114.sql @@ -0,0 +1,8 @@ +create sequence origin_metadata_translation_id_seq + start with 1 + increment by 1 + no maxvalue + no minvalue + cache 1; + +select setval('fossology_license_id_seq', 833, true); diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 99942cf..2481b88 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.indexer -Version: 0.0.43 +Version: 0.0.44 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.indexer.egg-info/SOURCES.txt b/swh.indexer.egg-info/SOURCES.txt index 81e40f6..e096619 100644 --- a/swh.indexer.egg-info/SOURCES.txt +++ b/swh.indexer.egg-info/SOURCES.txt @@ -1,46 +1,72 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile README codemeta.json requirements-swh.txt requirements.txt setup.py version.txt debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder +sql/Makefile +sql/swh-data.sql +sql/swh-enums.sql +sql/swh-func.sql +sql/swh-indexes.sql +sql/swh-init.sql +sql/swh-schema.sql +sql/bin/db-upgrade +sql/bin/dot_add_content +sql/doc/json +sql/json/.gitignore +sql/json/Makefile +sql/json/indexer_configuration.tool_configuration.schema.json +sql/json/revision_metadata.translated_metadata.json +sql/upgrades/114.sql swh/__init__.py swh.indexer.egg-info/PKG-INFO swh.indexer.egg-info/SOURCES.txt swh.indexer.egg-info/dependency_links.txt swh.indexer.egg-info/requires.txt swh.indexer.egg-info/top_level.txt swh/indexer/__init__.py swh/indexer/ctags.py swh/indexer/fossology_license.py swh/indexer/indexer.py swh/indexer/language.py swh/indexer/metadata.py swh/indexer/metadata_detector.py swh/indexer/metadata_dictionary.py swh/indexer/mimetype.py swh/indexer/orchestrator.py swh/indexer/producer.py swh/indexer/rehash.py swh/indexer/tasks.py +swh/indexer/storage/__init__.py +swh/indexer/storage/converters.py +swh/indexer/storage/db.py +swh/indexer/storage/api/__init__.py +swh/indexer/storage/api/client.py +swh/indexer/storage/api/server.py 
+swh/indexer/tests/__init__.py swh/indexer/tests/test_language.py swh/indexer/tests/test_metadata.py swh/indexer/tests/test_mimetype.py -swh/indexer/tests/test_utils.py \ No newline at end of file +swh/indexer/tests/test_utils.py +swh/indexer/tests/storage/__init__.py +swh/indexer/tests/storage/test_api_client.py +swh/indexer/tests/storage/test_converters.py +swh/indexer/tests/storage/test_storage.py \ No newline at end of file diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py index a5f3dfd..b558a81 100644 --- a/swh/indexer/__init__.py +++ b/swh/indexer/__init__.py @@ -1,29 +1,55 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information INDEXER_CLASSES = { 'mimetype': 'swh.indexer.mimetype.ContentMimetypeIndexer', 'language': 'swh.indexer.language.ContentLanguageIndexer', 'ctags': 'swh.indexer.ctags.CtagsIndexer', 'fossology_license': 'swh.indexer.fossology_license.ContentFossologyLicenseIndexer', } TASK_NAMES = { 'orchestrator_all': 'swh.indexer.tasks.SWHOrchestratorAllContentsTask', 'orchestrator_text': 'swh.indexer.tasks.SWHOrchestratorTextContentsTask', 'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask', 'language': 'swh.indexer.tasks.SWHContentLanguageTask', 'ctags': 'swh.indexer.tasks.SWHCtagsTask', 'fossology_license': 'swh.indexer.tasks.SWHContentFossologyLicenseTask', 'rehash': 'swh.indexer.tasks.SWHRecomputeChecksumsTask', } __all__ = [ 'INDEXER_CLASSES', 'TASK_NAMES', ] + + +def get_indexer_storage(cls, args): + """Get an indexer storage object of class `storage_class` with + arguments `storage_args`. + + Args: + storage (dict): dictionary with keys: + - cls (str): storage's class, either 'local' or 'remote' + - args (dict): dictionary with keys + + Returns: + an instance of swh.indexer's storage (either local or remote) + + Raises: + ValueError if passed an unknown storage class. + + """ + if cls == 'remote': + from .storage.api.client import RemoteStorage as IndexerStorage + elif cls == 'local': + from .storage import IndexerStorage + else: + raise ValueError('Unknown indexer storage class `%s`' % cls) + + return IndexerStorage(**args) diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py index ec395f2..dde3740 100644 --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -1,161 +1,161 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import subprocess import json from swh.model import hashutil from .language import compute_language from .indexer import ContentIndexer, DiskIndexer # Options used to compute tags __FLAGS = [ '--fields=+lnz', # +l: language # +n: line number of tag definition # +z: include the symbol's kind (function, variable, ...) '--sort=no', # sort output on tag name '--links=no', # do not follow symlinks '--output-format=json', # outputs in json ] def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. 
Args: path: path to the file lang: language for that path (optional) Returns: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer, DiskIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.content_ctags_missing(( + yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) self.cleanup(content_path) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ - self.storage.content_ctags_add( + self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) @click.command() @click.option('--path', help="Path to execute index on") def main(path): r = list(run_ctags(path)) print(r) if __name__ == '__main__': main() diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py index d1f9db6..3d46407 100644 --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,141 +1,141 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import subprocess from swh.model import hashutil from .indexer import ContentIndexer, DiskIndexer def compute_license(path, log=None): """Determine license from file at path. 
Args: path: filepath to determine the license Returns: A dict with the following keys: - licenses ([str]): associated detected licenses to path - path (bytes): content filepath - tool (str): tool used to compute the output """ try: properties = subprocess.check_output(['nomossa', path], universal_newlines=True) if properties: res = properties.rstrip().split(' contains license(s) ') licenses = res[1].split(',') return { 'licenses': licenses, 'path': path, } except subprocess.CalledProcessError: if log: from os import path as __path log.exception('Problem during license detection for sha1 %s' % __path.basename(path)) return { 'licenses': [], 'path': path, } class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {license, encoding} from that content - store result in storage """ ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.fossology.license'), 'tools': ('dict', { 'name': 'nomos', 'version': '3.1.0rc2-31-ga2cbb8c', 'configuration': { 'command_line': 'nomossa ', }, }), } CONFIG_BASE_FILENAME = 'indexer/fossology_license' def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.content_fossology_license_missing(( + yield from self.idx_storage.content_fossology_license_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: sha1 (bytes): content's identifier raw_content (bytes): raw content in bytes Returns: A dict, representing a content_license, with keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path """ filename = hashutil.hash_to_hex(id) content_path = self.write_to_temp( filename=filename, data=data) try: properties = compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) finally: self.cleanup(content_path) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_license, dict with the following keys: - id (bytes): content's identifier (sha1) - license (bytes): license in bytes - path (bytes): path policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ - self.storage.content_fossology_license_add( + self.idx_storage.content_fossology_license_add( results, conflict_update=(policy_update == 'update-dups')) @click.command(help='Compute license for path using tool') @click.option('--tool', default='nomossa', help="Path to tool") @click.option('--path', required=1, help="Path to execute index on") def main(tool, path): print(compute_license(tool, path)) if __name__ == '__main__': main() diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py index 2cd850d..07cd85c 100644 --- a/swh/indexer/indexer.py +++ b/swh/indexer/indexer.py @@ -1,419 +1,418 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import os import logging import shutil import tempfile from swh.core.config import SWHConfig from swh.objstorage import get_objstorage from swh.objstorage.exc import ObjNotFoundError from swh.model import hashutil -from swh.storage import get_storage from swh.scheduler.utils import get_task +from swh.indexer import get_indexer_storage class DiskIndexer: """Mixin intended to be used with other SomethingIndexer classes. Indexers inheriting from this class are a category of indexers which needs the disk for their computations. Note: This expects `self.working_directory` variable defined at runtime. """ def write_to_temp(self, filename, data): """Write the sha1's content in a temporary file. Args: sha1 (str): the sha1 name filename (str): one of sha1's many filenames data (bytes): the sha1's content to write in temporary file Returns: The path to the temporary file created. That file is filled in with the raw content's data. """ os.makedirs(self.working_directory, exist_ok=True) temp_dir = tempfile.mkdtemp(dir=self.working_directory) content_path = os.path.join(temp_dir, filename) with open(content_path, 'wb') as f: f.write(data) return content_path def cleanup(self, content_path): """Remove content_path from working directory. Args: content_path (str): the file to remove """ temp_dir = os.path.dirname(content_path) shutil.rmtree(temp_dir) class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta): """Base class for indexers to inherit from. The main entry point is the :func:`run` function which is in charge of triggering the computations on the batch dict/ids received. Indexers can: - filter out ids whose data has already been indexed. - retrieve ids data from storage or objstorage - index this data depending on the object and store the result in storage. To implement a new object type indexer, inherit from the BaseIndexer and implement the process of indexation: :func:`run`: object_ids are different depending on object. For example: sha1 for content, sha1_git for revision, directory, release, and id for origin To implement a new concrete indexer, inherit from the object level classes: :class:`ContentIndexer`, :class:`RevisionIndexer` (later on :class:`OriginIndexer` will also be available) Then you need to implement the following functions: :func:`filter`: filter out data already indexed (in storage). 
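[Editor's aside on the DiskIndexer mixin above: disk-based indexers write each content to a throwaway temp directory, run an external tool on it, then remove the directory. A standalone restatement of that lifecycle, with a hypothetical working directory.]

import os
import shutil
import tempfile

class DiskHelper:
    """Minimal standalone version of the DiskIndexer temp-file lifecycle."""
    def __init__(self, working_directory):
        self.working_directory = working_directory

    def write_to_temp(self, filename, data):
        # One fresh temp dir per content, under the configured workdir.
        os.makedirs(self.working_directory, exist_ok=True)
        temp_dir = tempfile.mkdtemp(dir=self.working_directory)
        content_path = os.path.join(temp_dir, filename)
        with open(content_path, 'wb') as f:
            f.write(data)
        return content_path

    def cleanup(self, content_path):
        # Remove the whole temp dir, not just the file.
        shutil.rmtree(os.path.dirname(content_path))

helper = DiskHelper('/tmp/swh/indexer.example')   # hypothetical workdir
path = helper.write_to_temp('deadbeef', b'int main() { return 0; }\n')
print('wrote', path)
helper.cleanup(path)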
This function is used by the orchestrator and not directly by the indexer (cf. swh.indexer.orchestrator.BaseOrchestratorIndexer). :func:`index_object`: compute index on id with data (retrieved from the storage or the objstorage by the id key) and return the resulting index computation. :func:`persist_index_computations`: persist the results of multiple index computations in the storage. The new indexer implementation can also override the following functions: :func:`prepare`: Configuration preparation for the indexer. When overriding, this must call the `super().prepare()` instruction. :func:`check`: Configuration check for the indexer. When overriding, this must call the `super().check()` instruction. :func:`register_tools`: This should return a dict of the tool(s) to use when indexing or filtering. """ CONFIG = 'indexer/base' DEFAULT_CONFIG = { - 'storage': ('dict', { - 'host': 'uffizi', + 'indexer_storage': ('dict', { 'cls': 'remote', - 'args': {'root': '/tmp/softwareheritage/objects', - 'slicing': '0:2/2:4/4:6'} + 'args': { + 'db': 'service=swh-indexer-dev' + } }), + # queue to reschedule if problem (none for no rescheduling, # the default) 'rescheduling_task': ('str', None), 'objstorage': ('dict', { 'cls': 'multiplexer', 'args': { 'objstorages': [{ 'cls': 'filtered', 'args': { 'storage_conf': { 'cls': 'azure-storage', 'args': { 'account_name': '0euwestswh', 'api_secret_key': 'secret', 'container_name': 'contents' } }, 'filters_conf': [ {'type': 'readonly'}, {'type': 'prefix', 'prefix': '0'} ] } }, { 'cls': 'filtered', 'args': { 'storage_conf': { 'cls': 'azure-storage', 'args': { 'account_name': '1euwestswh', 'api_secret_key': 'secret', 'container_name': 'contents' } }, 'filters_conf': [ {'type': 'readonly'}, {'type': 'prefix', 'prefix': '1'} ] } }] }, }), } ADDITIONAL_CONFIG = {} def __init__(self): """Prepare and check that the indexer is ready to run. """ super().__init__() self.prepare() self.check() def prepare(self): """Prepare the indexer's needed runtime configuration. Without this step, the indexer cannot possibly run. """ self.config = self.parse_config_file( additional_configs=[self.ADDITIONAL_CONFIG]) objstorage = self.config['objstorage'] self.objstorage = get_objstorage(objstorage['cls'], objstorage['args']) - storage = self.config['storage'] - self.storage = get_storage(storage['cls'], storage['args']) + idx_storage = self.config['indexer_storage'] + self.idx_storage = get_indexer_storage(**idx_storage) rescheduling_task = self.config['rescheduling_task'] if rescheduling_task: self.rescheduling_task = get_task(rescheduling_task) else: self.rescheduling_task = None l = logging.getLogger('requests.packages.urllib3.connectionpool') l.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = list(self.register_tools(self.config['tools'])) def check(self): """Check the indexer's configuration is ok before proceeding. If ok, does nothing. If not raise error. """ if not self.tools: raise ValueError('Tools %s is unknown, cannot continue' % self.tools) def _prepare_tool(self, tool): """Prepare the tool dict to be compliant with the storage api. """ return {'tool_%s' % key: value for key, value in tool.items()} def register_tools(self, tools): """Permit to register tools to the storage. Add a sensible default which can be overridden if not sufficient. (For now, all indexers use only one tool) Expects the self.config['tools'] property to be set with one or more tools. Args: tools (dict/[dict]): Either a dict or a list of dict. 
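[Editor's aside on the configuration change above: indexers now read an 'indexer_storage' entry instead of 'storage' and expand it into get_indexer_storage(). A hedged sketch of the two shapes that appear in this diff, for reference only.]

# Remote indexer storage, as in BaseIndexer.DEFAULT_CONFIG above:
remote_indexer_storage = {
    'cls': 'remote',
    'args': {'db': 'service=swh-indexer-dev'},
}

# Local indexer storage, as in the API server's DEFAULT_CONFIG later in this diff:
local_indexer_storage = {
    'cls': 'local',
    'args': {'db': 'dbname=softwareheritage-indexer-dev'},
}

# Either shape is expanded by prepare() into get_indexer_storage(cls=..., args=...).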
Returns: List of dict with additional id key. Raises: ValueError if not a list nor a dict. """ tools = self.config['tools'] if isinstance(tools, list): tools = map(self._prepare_tool, tools) elif isinstance(tools, dict): tools = [self._prepare_tool(tools)] else: raise ValueError('Configuration tool(s) must be a dict or list!') - registered_tools = self.storage.indexer_configuration_add(tools) - return registered_tools + return self.idx_storage.indexer_configuration_add(tools) @abc.abstractmethod def filter(self, ids): """Filter missing ids for that particular indexer. Args: ids ([bytes]): list of ids Yields: iterator of missing ids """ pass @abc.abstractmethod def index(self, id, data): """Index computation for the id and associated raw data. Args: id (bytes): identifier data (bytes): id's data from storage or objstorage depending on object type Returns: a dict that makes sense for the persist_index_computations function. """ pass @abc.abstractmethod def persist_index_computations(self, results, policy_update): """Persist the computation resulting from the index. Args: results ([result]): List of results. One result is the result of the index function. policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them Returns: None """ pass def next_step(self, results): """Do something else with computations results (e.g. send to another queue, ...). (This is not an abstractmethod since it is optional). Args: results ([result]): List of results (dict) as returned by index function. Returns: None """ pass @abc.abstractmethod def run(self, ids, policy_update): """Given a list of ids: - retrieves the data from the storage - executes the indexing computations - stores the results (according to policy_update) Args: ids ([bytes]): id's identifier list policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ pass class ContentIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements the process of indexation for Contents using the run method Note: the :class:`ContentIndexer` is not an instantiable object. To use it in another context, one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ def run(self, ids, policy_update): """Given a list of ids: - retrieve the content from the storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1's identifier list policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] try: for sha1 in ids: try: raw_content = self.objstorage.get(sha1) except ObjNotFoundError: self.log.warn('Content %s not found in objstorage' % hashutil.hash_to_hex(sha1)) continue res = self.index(sha1, raw_content) if res: # If no results, skip it results.append(res) self.persist_index_computations(results, policy_update) self.next_step(results) except Exception: self.log.exception( 'Problem when reading contents metadata.') if self.rescheduling_task: self.log.warn('Rescheduling batch') self.rescheduling_task.delay(ids, policy_update) class RevisionIndexer(BaseIndexer): """An object type indexer, inherits from the :class:`BaseIndexer` and implements the process of indexation for Revisions using the run method Note: the :class:`RevisionIndexer` is not an instantiable object. 
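[Editor's aside on the indexer contract: to make filter / index / persist_index_computations concrete, here is an illustrative skeleton. It deliberately stubs the storage calls so it can run standalone; a real indexer would inherit ContentIndexer and go through idx_storage instead.]

class ExampleContentIndexer:
    """Illustrative skeleton of the ContentIndexer contract (not the real API)."""

    def __init__(self, tool_id=1):
        self.tool = {'id': tool_id}   # hypothetical tool id
        self.known = set()            # stands in for idx_storage.*_missing()

    def filter(self, ids):
        # Yield only the ids this tool has not indexed yet.
        for sha1 in ids:
            if (sha1, self.tool['id']) not in self.known:
                yield sha1

    def index(self, id, data):
        # Compute something from the raw content; here, just its size.
        return {'id': id, 'length': len(data),
                'indexer_configuration_id': self.tool['id']}

    def persist_index_computations(self, results, policy_update):
        # A real indexer calls idx_storage.<table>_add(results, conflict_update=...).
        for res in results:
            self.known.add((res['id'], res['indexer_configuration_id']))

indexer = ExampleContentIndexer()
batch = {b'\x01' * 20: b'hello world', b'\x02' * 20: b'#!/bin/sh\n'}
to_index = list(indexer.filter(batch))
results = [indexer.index(sha1, batch[sha1]) for sha1 in to_index]
indexer.persist_index_computations(results, 'ignore-dups')
print(len(results), 'indexed;', len(list(indexer.filter(batch))), 'still missing')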
To use it in another context one should inherit from this class and override the methods mentioned in the :class:`BaseIndexer` class. """ - def run(self, ids, policy_update): """Given a list of sha1_gits: - retrieve revisions from storage - execute the indexing computations - store the results (according to policy_update) Args: ids ([bytes]): sha1_git's identifier list policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ results = [] revs = self.storage.revision_get(ids) for rev in revs: if not rev: self.log.warn('Revisions %s not found in storage' % list(map(hashutil.hash_to_hex, ids))) continue try: res = self.index(rev) if res: # If no results, skip it results.append(res) except Exception: self.log.exception( 'Problem when processing revision') self.persist_index_computations(results, policy_update) diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 78d6b62..6433977 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,208 +1,208 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io from pygments.lexers import guess_lexer from pygments.util import ClassNotFound from chardet.universaldetector import UniversalDetector from .indexer import ContentIndexer def _cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') def _read_raw(raw_content, size=2048): """Read raw content in chunk. """ bs = io.BytesIO(raw_content) while True: chunk = bs.read(size) if not chunk: break yield chunk def _detect_encoding(raw_content): """Given a raw content, try and detect its encoding. """ detector = UniversalDetector() for chunk in _read_raw(raw_content): detector.feed(chunk) if detector.done: break detector.close() return detector.result['encoding'] def compute_language_from_chunk(encoding, length, raw_content, max_size, log=None): """Determine the raw content's language. Args: encoding (str): Encoding to use to decode the content length (int): raw_content's length raw_content (bytes): raw content to work with max_size (int): max size to split the raw content at Returns: Dict with keys: - lang: None if nothing found or the possible language """ try: if max_size <= length: raw_content = raw_content[0:max_size] content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except UnicodeDecodeError: raise except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } def compute_language(raw_content, encoding=None, log=None): """Determine the raw content's language. 
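[Editor's aside on language.py below: chardet guesses the encoding, pygments guesses the lexer, and the lexer name (lower-cased, spaces replaced by dashes) becomes the stored language. A minimal standalone demonstration of that pipeline; it assumes the chardet and pygments packages are installed.]

from chardet.universaldetector import UniversalDetector
from pygments.lexers import guess_lexer
from pygments.util import ClassNotFound

raw_content = b'def add(a, b):\n    return a + b\n'

detector = UniversalDetector()
detector.feed(raw_content)
detector.close()
encoding = detector.result['encoding'] or 'utf-8'

try:
    lang = guess_lexer(raw_content.decode(encoding)).name.lower().replace(' ', '-')
except ClassNotFound:
    lang = None

print({'lang': lang})   # most likely {'lang': 'python'}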
Args: raw_content (bytes): raw content to work with Returns: Dict with keys: - lang: None if nothing found or the possible language """ try: encoding = _detect_encoding(raw_content) content = raw_content.decode(encoding) lang = _cleanup_classname( guess_lexer(content).name) except ClassNotFound: lang = None except Exception: if log: log.exception('Problem during language detection, skipping') lang = None return { 'lang': lang } class ContentLanguageIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ CONFIG_BASE_FILENAME = 'indexer/language' ADDITIONAL_CONFIG = { 'tools': ('dict', { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, }), } def prepare(self): super().prepare() c = self.config self.max_content_size = c['tools']['configuration']['max_content_size'] self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.content_language_missing(( + yield from self.idx_storage.content_language_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'] } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'lang': None, } encoding = _detect_encoding(data) if not encoding: return result l = len(data) for i in range(0, 9): max_size = self.max_content_size + i try: result = compute_language_from_chunk( encoding, l, data, max_size, log=self.log) except UnicodeDecodeError: self.log.warn('Decoding failed on wrong byte chunk at [0-%s]' ', trying again at next ending byte.' % max_size) continue # we found something, so we return it result.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) break return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
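[Editor's aside on the retry loop in ContentLanguageIndexer.index(): cutting a multi-byte encoding at an arbitrary byte offset can split a character, so the indexer grows the window one byte at a time and retries. A standalone sketch of why that works.]

# Truncating UTF-8 in the middle of a multi-byte character raises
# UnicodeDecodeError; nudging the cut point forward, as index() does,
# eventually yields a cleanly decodable prefix.
data = 'e acute: é'.encode('utf-8')       # the last character is two bytes in UTF-8
max_size = len(data) - 1                   # cut in the middle of 'é'

for extra in range(0, 9):
    try:
        prefix = data[:max_size + extra].decode('utf-8')
        print('decoded with extra =', extra, '->', repr(prefix))
        break
    except UnicodeDecodeError:
        print('split a character at extra =', extra, ', retrying')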
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ - self.storage.content_language_add( + self.idx_storage.content_language_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index f40c0e4..9bded05 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,293 +1,299 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import logging from swh.indexer.indexer import ContentIndexer, RevisionIndexer from swh.indexer.metadata_dictionary import compute_metadata from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_detector import extract_minimal_metadata_dict from swh.model import hashutil class ContentMetadataIndexer(ContentIndexer): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing translated_metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ CONFIG_BASE_FILENAME = 'indexer/metadata' def __init__(self, tool, config): # twisted way to use the exact same config of RevisionMetadataIndexer # object that uses internally ContentMetadataIndexer self.config = config self.config['tools'] = tool super().__init__() def prepare(self): self.results = [] - if self.config['storage']: - self.storage = self.config['storage'] + if self.config['indexer_storage']: + self.idx_storage = self.config['indexer_storage'] if self.config['objstorage']: self.objstorage = self.config['objstorage'] l = logging.getLogger('requests.packages.urllib3.connectionpool') l.setLevel(logging.WARN) self.log = logging.getLogger('swh.indexer') self.tools = self.register_tools(self.config['tools']) # NOTE: only one tool so far, change when no longer true self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.content_metadata_missing(( + yield from self.idx_storage.content_metadata_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the translated_metadata keys will be returned as None """ result = { 'id': id, 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } try: context = self.tool['tool_configuration']['context'] result['translated_metadata'] = compute_metadata(context, data) # a twisted way to keep result with indexer object for get_results self.results.append(result) except: self.log.exception( "Problem during tool retrieval of metadata translation") return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
Args: results ([dict]): list of content_metadata, dict with the following keys: - id (bytes): content's identifier (sha1) - translated_metadata (jsonb): detected metadata policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ - self.storage.content_metadata_add( + self.idx_storage.content_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def get_results(self): """can be called only if run method was called before Returns: list: list of content_metadata entries calculated by current indexer """ return self.results class RevisionMetadataIndexer(RevisionIndexer): """Revision-level indexer This indexer is in charge of: - filtering revisions already indexed in revision_metadata table with defined computation tool - retrieve all entry_files in root directory - use metadata_detector for file_names containig metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for revision """ CONFIG_BASE_FILENAME = 'indexer/metadata' ADDITIONAL_CONFIG = { + 'storage': ('dict', { + 'cls': 'remote', + 'args': { + 'url': 'http://localhost:5002/', + } + }), 'tools': ('dict', { 'name': 'swh-metadata-detector', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': ['npm', 'codemeta'] }, }), } def prepare(self): super().prepare() self.tool = self.tools[0] def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.revision_metadata_missing(( + yield from self.idx_storage.revision_metadata_missing(( { 'id': sha1_git, 'indexer_configuration_id': self.tool['id'], } for sha1_git in sha1_gits )) def index(self, rev): """Index rev by processing it and organizing result. use metadata_detector to iterate on filenames - if one filename detected -> sends file to content indexer - if multiple file detected -> translation needed at revision level Args: rev (bytes): revision artifact from storage Returns: dict: dictionary representing a revision_metadata, with keys: - id (bytes): rev's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - translated_metadata (bytes): dict of retrieved metadata """ try: result = { 'id': rev['id'], 'indexer_configuration_id': self.tool['id'], 'translated_metadata': None } root_dir = rev['directory'] dir_ls = self.storage.directory_ls(root_dir, recursive=False) files = (entry for entry in dir_ls if entry['type'] == 'file') detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( detected_files) except Exception as e: self.log.exception( 'Problem when indexing rev') return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
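[Editor's aside on the new 'storage' entry in RevisionMetadataIndexer.ADDITIONAL_CONFIG above: the revision-level indexer still needs the regular archive storage (revision_get, directory_ls) in addition to the new indexer storage, which appears to be what this entry is for. A hedged sketch of the two backend configurations involved, with values taken from this diff.]

# Two distinct backends (hedged sketch):
#  - the archive storage       -> revision_get, directory_ls
#  - the indexer storage       -> revision_metadata_*, content_metadata_*
archive_storage_config = {
    'cls': 'remote',
    'args': {'url': 'http://localhost:5002/'},
}
indexer_storage_config = {
    'cls': 'remote',
    'args': {'db': 'service=swh-indexer-dev'},
}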
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ # TODO: add functions in storage to keep data in revision_metadata - self.storage.revision_metadata_add( + self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) def translate_revision_metadata(self, detected_files): """ Determine plan of action to translate metadata when containing one or multiple detected files: Args: detected_files (dict): dictionary mapping context names (e.g., "npm", "authors") to list of sha1 Returns: dict: dict with translated metadata according to the CodeMeta vocabulary """ translated_metadata = [] tool = { 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': None }, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = { - 'storage': self.storage, + 'indexer_storage': self.idx_storage, 'objstorage': self.objstorage } for context in detected_files.keys(): tool['configuration']['context'] = context c_metadata_indexer = ContentMetadataIndexer(tool, config) # sha1s that are in content_metadata table sha1s_in_storage = [] - metadata_generator = self.storage.content_metadata_get( - detected_files[context]) + metadata_generator = self.idx_storage.content_metadata_get( + detected_files[context]) for c in metadata_generator: # extracting translated_metadata sha1 = c['id'] sha1s_in_storage.append(sha1) local_metadata = c['translated_metadata'] # local metadata is aggregated if local_metadata: translated_metadata.append(local_metadata) sha1s_filtered = [item for item in detected_files[context] if item not in sha1s_in_storage] if sha1s_filtered: # schedule indexation of content try: c_metadata_indexer.run(sha1s_filtered, policy_update='ignore-dups') # on the fly possibility: results = c_metadata_indexer.get_results() for result in results: local_metadata = result['translated_metadata'] translated_metadata.append(local_metadata) except Exception as e: self.log.warn("""Exception while indexing content""", e) # transform translated_metadata into min set with swh-metadata-detector min_metadata = extract_minimal_metadata_dict(translated_metadata) return min_metadata @click.command() @click.option('--revs', '-i', default=['8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', '026040ea79dec1b49b4e3e7beda9132b6b26b51b', '9699072e21eded4be8d45e3b8d543952533fa190'], help='Default sha1_git to lookup', multiple=True) def main(revs): _git_sha1s = list(map(hashutil.hash_to_bytes, revs)) rev_metadata_indexer = RevisionMetadataIndexer() rev_metadata_indexer.run(_git_sha1s, 'update-dups') if __name__ == '__main__': logging.basicConfig(level=logging.INFO) main() diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index 5e2ee14..56a0e54 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -1,211 +1,210 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json def convert(raw_content): """ convert raw_content recursively: - from bytes to string - from string to dict Args: raw_content 
(bytes / string / dict) Returns: dict: content (if string was json, otherwise returns string) """ if isinstance(raw_content, bytes): return convert(raw_content.decode()) if isinstance(raw_content, str): try: content = json.loads(raw_content) if content: return content else: return raw_content except json.decoder.JSONDecodeError: return raw_content if isinstance(raw_content, dict): return raw_content class BaseMapping(): """Base class for mappings to inherit from To implement a new mapping: - inherit this class - add a local property self.mapping - override translate function """ def translate(self, content_dict): """ Tranlsates content by parsing content to a json object and translating with the npm mapping (for now hard_coded mapping) Args: context_text (text): should be json Returns: dict: translated metadata in jsonb form needed for the indexer """ translated_metadata = {} default = 'other' translated_metadata['other'] = {} try: for k, v in content_dict.items(): try: term = self.mapping.get(k, default) if term not in translated_metadata: translated_metadata[term] = v continue if isinstance(translated_metadata[term], str): in_value = translated_metadata[term] translated_metadata[term] = [in_value, v] continue if isinstance(translated_metadata[term], list): translated_metadata[term].append(v) continue if isinstance(translated_metadata[term], dict): translated_metadata[term][k] = v continue except KeyError: self.log.exception( "Problem during item mapping") continue except: return None return translated_metadata class NpmMapping(BaseMapping): """ dedicated class for NPM (package.json) mapping and translation """ mapping = { 'repository': 'codeRepository', 'os': 'operatingSystem', 'cpu': 'processorRequirements', 'engines': 'processorRequirements', 'dependencies': 'softwareRequirements', 'bundleDependencies': 'softwareRequirements', 'peerDependencies': 'softwareRequirements', 'author': 'author', 'contributor': 'contributor', 'keywords': 'keywords', 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'devDependencies': 'softwareSuggestions', 'optionalDependencies': 'softwareSuggestions', 'bugs': 'issueTracker', 'homepage': 'url' } def translate(self, raw_content): content_dict = convert(raw_content) return super().translate(content_dict) class MavenMapping(BaseMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ mapping = { 'license': 'license', 'version': 'version', 'description': 'description', 'name': 'name', 'prerequisites': 'softwareRequirements', 'repositories': 'codeRepository', 'groupId': 'identifier', 'ciManagement': 'contIntegration', 'issuesManagement': 'issueTracker', } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) class DoapMapping(BaseMapping): mapping = { } def translate(self, raw_content): content = convert(raw_content) # parse content from xml to dict return super().translate(content) def parse_xml(content): """ Parses content from xml to a python dict Args: - content (text): the string form of the raw_content ( in xml) Returns: - parsed_xml (dict): a python dict of the content after parsing """ # check if xml # use xml parser to dict return content mapping_tool_fn = { "npm": NpmMapping(), "maven": MavenMapping(), "doap_xml": DoapMapping() } def compute_metadata(context, raw_content): """ first landing method: a dispatcher that sends content to the right function to carry out the real parsing of syntax and translation of terms 
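[Editor's aside on the npm mapping above: as a concrete illustration, here is what translating a tiny package.json-style dict looks like. It reimplements a simplified version of the per-key lookup (string/list accumulation omitted) rather than importing the indexer; the input values are hypothetical.]

npm_mapping = {
    'repository': 'codeRepository', 'author': 'author', 'license': 'license',
    'name': 'name', 'description': 'description', 'version': 'version',
    'dependencies': 'softwareRequirements', 'homepage': 'url',
}

package_json = {
    'name': 'test_name',
    'license': 'MIT',
    'dependencies': {'left-pad': '^1.0.0'},
    'unknown_term': 'ut',
}

translated = {'other': {}}
for key, value in package_json.items():
    term = npm_mapping.get(key, 'other')
    if term not in translated:
        translated[term] = value
    elif isinstance(translated[term], dict):
        translated[term][key] = value      # unmapped keys collect under 'other'

print(translated)
# {'other': {'unknown_term': 'ut'}, 'name': 'test_name',
#  'license': 'MIT', 'softwareRequirements': {'left-pad': '^1.0.0'}}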
Args: context (text): defines to which function/tool the content is sent content (text): the string form of the raw_content Returns: dict: translated metadata jsonb dictionary needed for the indexer to store in storage """ if raw_content is None or raw_content is b"": return None # TODO: keep mapping not in code (maybe fetch crosswalk from storage?) # if fetched from storage should be done once for batch of sha1s dictionary = mapping_tool_fn[context] translated_metadata = dictionary.translate(raw_content) - # print(translated_metadata) return translated_metadata def main(): raw_content = """{"name": "test_name", "unknown_term": "ut"}""" raw_content1 = b"""{"name": "test_name", "unknown_term": "ut", "prerequisites" :"packageXYZ"}""" result = compute_metadata("npm", raw_content) result1 = compute_metadata("maven", raw_content1) print(result) print(result1) if __name__ == "__main__": main() diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index 8bbbf64..57bcd3a 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,158 +1,158 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import magic from swh.model import hashutil from swh.scheduler import utils from .indexer import ContentIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: A dict with mimetype and encoding key and corresponding values (as bytes). """ r = magic.detect_from_content(raw_content) return { 'mimetype': r.mime_type.encode('utf-8'), 'encoding': r.encoding.encode('utf-8'), } class ContentMimetypeIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ ADDITIONAL_CONFIG = { 'destination_queue': ('str', None), 'tools': ('dict', { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() destination_queue = self.config.get('destination_queue') if destination_queue: self.task_destination = utils.get_task(destination_queue) else: self.task_destination = None self.tool = self.tools[0] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ - yield from self.storage.content_mimetype_missing(( + yield from self.idx_storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ try: properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], }) except TypeError: self.log.error('Detecting mimetype error for id %s' % ( hashutil.hash_to_hex(id), )) return None return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. 
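[Editor's aside on compute_mimetype_encoding below: a standalone check of the detection it performs. This relies on the same binding the indexer's tool configuration declares (Debian's python3-magic, i.e. the bindings shipped with file(1), which expose magic.detect_from_content); the sample content is hypothetical.]

import magic

raw_content = b'#!/bin/sh\necho hello\n'
r = magic.detect_from_content(raw_content)
print({'mimetype': r.mime_type.encode('utf-8'),
       'encoding': r.encoding.encode('utf-8')})
# e.g. {'mimetype': b'text/x-shellscript', 'encoding': b'us-ascii'}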
Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ - self.storage.content_mimetype_add( + self.idx_storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) def _filter_text(self, results): """Filter sha1 whose raw content is text. """ for result in results: if b'binary' in result['encoding']: continue yield result['id'] def next_step(self, results): """When the computations is done, we'd like to send over only text contents to the text content orchestrator. Args: results ([dict]): List of content_mimetype results, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ if self.task_destination: self.task_destination.delay(list(self._filter_text(results))) @click.command() @click.option('--path', help="Path to execute index on") def main(path): with open(path, 'rb') as f: raw_content = f.read() print(compute_mimetype_encoding(raw_content)) if __name__ == '__main__': main() diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py new file mode 100644 index 0000000..8325954 --- /dev/null +++ b/swh/indexer/storage/__init__.py @@ -0,0 +1,521 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +import json +import dateutil.parser +import psycopg2 + +from swh.storage.common import db_transaction_generator, db_transaction +from swh.storage.exc import StorageDBError +from .db import Db + +from . import converters + + +class IndexerStorage(): + """SWH Indexer Storage + + """ + def __init__(self, db): + """ + Args: + db_conn: either a libpq connection string, or a psycopg2 connection + obj_root: path to the root of the object storage + + """ + try: + if isinstance(db, psycopg2.extensions.connection): + self.db = Db(db) + else: + self.db = Db.connect(db) + except psycopg2.OperationalError as e: + raise StorageDBError(e) + + def check_config(self, *, check_write): + """Check that the storage is configured and ready to go.""" + # Check permissions on one of the tables + with self.db.transaction() as cur: + if check_write: + check = 'INSERT' + else: + check = 'SELECT' + + cur.execute( + "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa + (check,) + ) + return cur.fetchone()[0] + + return True + + @db_transaction_generator + def content_mimetype_missing(self, mimetypes, cur=None): + """List mimetypes missing from storage. 
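[Editor's aside on the new IndexerStorage: it can also be exercised directly. A minimal hedged sketch of instantiating the local backend and running the permission check shown above; it assumes swh.indexer is installed and that the softwareheritage-indexer-dev database from sql/Makefile exists, so it is not runnable without that environment.]

from swh.indexer.storage import IndexerStorage

storage = IndexerStorage(db='dbname=softwareheritage-indexer-dev')
print('writable:', storage.check_config(check_write=True))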
+ + Args: + mimetypes (iterable): iterable of dict with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: an iterable of missing id for the triplets id, tool_name, + tool_version + + """ + db = self.db + db.mktemp_content_mimetype_missing(cur) + db.copy_to(mimetypes, 'tmp_content_mimetype_missing', + ['id', 'indexer_configuration_id'], + cur) + for obj in db.content_mimetype_missing_from_temp(cur): + yield obj[0] + + @db_transaction + def content_mimetype_add(self, mimetypes, conflict_update=False, cur=None): + """Add mimetypes not present in storage. + + Args: + mimetypes (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - mimetype (bytes): raw content's mimetype + - encoding (bytes): raw content's encoding + - indexer_configuration_id (int): tool's id used to + compute the results + - conflict_update: Flag to determine if we want to + overwrite (true) or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_mimetype(cur) + db.copy_to(mimetypes, 'tmp_content_mimetype', + ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], + cur) + db.content_mimetype_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_mimetype_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_mimetype_get_from_temp(): + yield converters.db_to_mimetype( + dict(zip(db.content_mimetype_cols, c))) + + @db_transaction_generator + def content_language_missing(self, languages, cur=None): + """List languages missing from storage. + + Args: + languages (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: identifiers of missing languages + + """ + db = self.db + db.mktemp_content_language_missing(cur) + db.copy_to(languages, 'tmp_content_language_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.content_language_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_language_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_language_get_from_temp(): + yield converters.db_to_language( + dict(zip(db.content_language_cols, c))) + + @db_transaction + def content_language_add(self, languages, conflict_update=False, cur=None): + """Add languages not present in storage. + + Args: + languages (iterable): dictionaries with keys: + + - id: sha1 + - lang: bytes + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_language(cur) + # empty language is mapped to 'unknown' + db.copy_to( + ({ + 'id': l['id'], + 'lang': 'unknown' if not l['lang'] else l['lang'], + 'indexer_configuration_id': l['indexer_configuration_id'], + } for l in languages), + 'tmp_content_language', + ['id', 'lang', 'indexer_configuration_id'], cur) + + db.content_language_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_ctags_missing(self, ctags, cur=None): + """List ctags missing from storage. 
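[Editor's aside, continuing the previous sketch: the mimetype round-trip through the API. The sha1 and tool id are hypothetical, and the tool id must reference a row previously created via indexer_configuration_add; shapes follow the docstrings above.]

from swh.indexer.storage import IndexerStorage

storage = IndexerStorage(db='dbname=softwareheritage-indexer-dev')   # as above
sha1 = b'\x12' * 20                        # hypothetical content identifier
tool_id = 1                                # must be an existing indexer_configuration id

storage.content_mimetype_add([{
    'id': sha1,
    'mimetype': b'text/plain',
    'encoding': b'us-ascii',
    'indexer_configuration_id': tool_id,
}], conflict_update=False)

missing = list(storage.content_mimetype_missing(
    [{'id': sha1, 'indexer_configuration_id': tool_id}]))
print('still missing:', missing)           # expected: []

for row in storage.content_mimetype_get([sha1]):
    print(row['mimetype'], row['tool']['name'])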
+ + Args: + ctags (iterable): dicts with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool name used + - tool_version (str): associated version + + Returns: + an iterable of missing id + + """ + db = self.db + + db.mktemp_content_ctags_missing(cur) + db.copy_to(ctags, + tblname='tmp_content_ctags_missing', + columns=['id', 'indexer_configuration_id'], + cur=cur) + for obj in db.content_ctags_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_ctags_get(self, ids, cur=None): + """Retrieve ctags per id. + + Args: + ids (iterable): sha1 checksums + + """ + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_ctags_get_from_temp(): + yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + + @db_transaction + def content_ctags_add(self, ctags, conflict_update=False, cur=None): + """Add ctags not present in storage + + Args: + ctags (iterable): dictionaries with keys: + + - id (bytes): sha1 + - ctags ([list): List of dictionary with keys: name, kind, + line, language + + """ + db = self.db + + def _convert_ctags(__ctags): + """Convert ctags dict to list of ctags. + + """ + for ctags in __ctags: + yield from converters.ctags_to_db(ctags) + + db.mktemp_content_ctags(cur) + db.copy_to(list(_convert_ctags(ctags)), + tblname='tmp_content_ctags', + columns=['id', 'name', 'kind', 'line', + 'lang', 'indexer_configuration_id'], + cur=cur) + + db.content_ctags_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_ctags_search(self, expression, + limit=10, last_sha1=None, cur=None): + """Search through content's raw ctags symbols. + + Args: + expression (str): Expression to search for + limit (int): Number of rows to return (default to 10). + last_sha1 (str): Offset from which retrieving data (default to ''). + + Yields: + rows of ctags including id, name, lang, kind, line, etc... + + """ + db = self.db + + for obj in db.content_ctags_search(expression, last_sha1, limit, + cur=cur): + yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + + @db_transaction_generator + def content_fossology_license_get(self, ids, cur=None): + """Retrieve licenses per id. + + Args: + ids (iterable): sha1 checksums + + Yields: + list: dictionaries with the following keys: + + - id (bytes) + - licenses ([str]): associated licenses for that content + + """ + db = self.db + db.store_tmp_bytea(ids, cur) + + for c in db.content_fossology_license_get_from_temp(): + license = dict(zip(db.content_fossology_license_cols, c)) + yield converters.db_to_fossology_license(license) + + @db_transaction + def content_fossology_license_add(self, licenses, + conflict_update=False, cur=None): + """Add licenses not present in storage. 
+ + Args: + licenses (iterable): dictionaries with keys: + + - id: sha1 + - license ([bytes]): List of licenses associated to sha1 + - tool (str): nomossa + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + Returns: + list: content_license entries which failed due to unknown licenses + + """ + db = self.db + + # Then, we add the correct ones + db.mktemp_content_fossology_license(cur) + db.copy_to( + ({ + 'id': sha1['id'], + 'indexer_configuration_id': sha1['indexer_configuration_id'], + 'license': license, + } for sha1 in licenses + for license in sha1['licenses']), + tblname='tmp_content_fossology_license', + columns=['id', 'license', 'indexer_configuration_id'], + cur=cur) + db.content_fossology_license_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def content_metadata_missing(self, metadatas, cur=None): + """List metadatas missing from storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id (bytes): sha1 identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: missing ids + + """ + db = self.db + db.mktemp_content_metadata_missing(cur) + db.copy_to(metadatas, 'tmp_content_metadata_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.content_metadata_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def content_metadata_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.content_metadata_get_from_temp(): + yield converters.db_to_metadata( + dict(zip(db.content_metadata_cols, c))) + + @db_transaction + def content_metadata_add(self, metadatas, conflict_update=False, cur=None): + """Add metadatas not present in storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id: sha1 + - translated_metadata: bytes / jsonb ? + + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_content_metadata(cur) + # empty metadata is mapped to 'unknown' + + db.copy_to(metadatas, 'tmp_content_metadata', + ['id', 'translated_metadata', 'indexer_configuration_id'], + cur) + db.content_metadata_add_from_temp(conflict_update, cur) + + @db_transaction_generator + def revision_metadata_missing(self, metadatas, cur=None): + """List metadatas missing from storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id (bytes): sha1_git revision identifier + - tool_name (str): tool used to compute the results + - tool_version (str): associated tool's version + + Returns: + iterable: missing ids + + """ + db = self.db + db.mktemp_revision_metadata_missing(cur) + db.copy_to(metadatas, 'tmp_revision_metadata_missing', + ['id', 'indexer_configuration_id'], cur) + for obj in db.revision_metadata_missing_from_temp(cur): + yield obj[0] + + @db_transaction_generator + def revision_metadata_get(self, ids, cur=None): + db = self.db + db.store_tmp_bytea(ids, cur) + for c in db.revision_metadata_get_from_temp(): + yield converters.db_to_metadata( + dict(zip(db.revision_metadata_cols, c))) + + @db_transaction + def revision_metadata_add(self, metadatas, + conflict_update=False, cur=None): + """Add metadatas not present in storage. + + Args: + metadatas (iterable): dictionaries with keys: + + - id: sha1_git of revision + - translated_metadata: bytes / jsonb ? 
+ + conflict_update: Flag to determine if we want to overwrite (true) + or skip duplicates (false, the default) + + """ + db = self.db + db.mktemp_revision_metadata(cur) + # empty metadata is mapped to 'unknown' + + db.copy_to(metadatas, 'tmp_revision_metadata', + ['id', 'translated_metadata', 'indexer_configuration_id'], + cur) + db.revision_metadata_add_from_temp(conflict_update, cur) + + @db_transaction + def origin_metadata_add(self, origin_id, ts, provider, tool, metadata, + cur=None): + """ Add an origin_metadata for the origin at ts with provenance and + metadata. + + Args: + origin_id (int): the origin's id for which the metadata is added + ts (datetime): timestamp of the found metadata + provider (int): the provider of metadata (ex:'hal') + tool (int): tool used to extract metadata + metadata (jsonb): the metadata retrieved at the time and location + + Returns: + id (int): the origin_metadata unique id + """ + if isinstance(ts, str): + ts = dateutil.parser.parse(ts) + + return self.db.origin_metadata_add(origin_id, ts, provider, tool, + metadata, cur) + + @db_transaction_generator + def origin_metadata_get_by(self, origin_id, provider_type=None, cur=None): + """Retrieve list of all origin_metadata entries for the origin_id + + Args: + origin_id (int): the unique origin identifier + provider_type (str): (optional) type of provider + + Returns: + list of dicts: the origin_metadata dictionary with the keys: + + - id (int): origin_metadata's id + - origin_id (int): origin's id + - discovery_date (datetime): timestamp of discovery + - tool_id (int): metadata's extracting tool + - metadata (jsonb) + - provider_id (int): metadata's provider + - provider_name (str) + - provider_type (str) + - provider_url (str) + + """ + db = self.db + for line in db.origin_metadata_get_by(origin_id, provider_type, cur): + yield dict(zip(db.origin_metadata_get_cols, line)) + + @db_transaction_generator + def indexer_configuration_add(self, tools, cur=None): + """Add new tools to the storage. + + Args: + tools ([dict]): List of dictionary representing tool to + insert in the db. Dictionary with the following keys:: + + tool_name (str): tool's name + tool_version (str): tool's version + tool_configuration (dict): tool's configuration (free form + dict) + + Returns: + List of dict inserted in the db (holding the id key as + well). The order of the list is not guaranteed to match + the order of the initial list. + + """ + db = self.db + db.mktemp_indexer_configuration(cur) + db.copy_to(tools, 'tmp_indexer_configuration', + ['tool_name', 'tool_version', 'tool_configuration'], + cur) + + tools = db.indexer_configuration_add_from_temp(cur) + for line in tools: + yield dict(zip(db.indexer_configuration_cols, line)) + + @db_transaction + def indexer_configuration_get(self, tool, cur=None): + """Retrieve tool information. + + Args: + tool (dict): Dictionary representing a tool with the + following keys:: + + tool_name (str): tool's name + tool_version (str): tool's version + tool_configuration (dict): tool's configuration (free form + dict) + + Returns: + The identifier of the tool if it exists, None otherwise. 
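[Editor's aside on tool registration: indexer_configuration_add is what produces the indexer_configuration_id used by every *_add call above. A hedged sketch using the tool_-prefixed keys that BaseIndexer._prepare_tool produces; the configuration values are hypothetical and the same local database setup as before is assumed.]

from swh.indexer.storage import IndexerStorage

storage = IndexerStorage(db='dbname=softwareheritage-indexer-dev')

tool = {
    'tool_name': 'nomos',
    'tool_version': '3.1.0rc2-31-ga2cbb8c',
    'tool_configuration': {'command_line': 'nomossa'},   # hypothetical configuration
}

registered = list(storage.indexer_configuration_add([tool]))
print(registered[0]['id'])                 # the id to pass as indexer_configuration_id

# The same dict can be used to look the tool up again later:
print(storage.indexer_configuration_get(tool))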
+ + """ + db = self.db + tool_conf = tool['tool_configuration'] + if isinstance(tool_conf, dict): + tool_conf = json.dumps(tool_conf) + idx = db.indexer_configuration_get(tool['tool_name'], + tool['tool_version'], + tool_conf) + if not idx: + return None + return dict(zip(self.db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/__init__.py b/swh/indexer/storage/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py new file mode 100644 index 0000000..e6a87a9 --- /dev/null +++ b/swh/indexer/storage/api/client.py @@ -0,0 +1,100 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.core.api import SWHRemoteAPI + +from swh.storage.exc import StorageAPIError + + +class RemoteStorage(SWHRemoteAPI): + """Proxy to a remote storage API""" + def __init__(self, url): + super().__init__(api_exception=StorageAPIError, url=url) + + def check_config(self, *, check_write): + return self.post('check_config', {'check_write': check_write}) + + def content_mimetype_add(self, mimetypes, conflict_update=False): + return self.post('content_mimetype/add', { + 'mimetypes': mimetypes, + 'conflict_update': conflict_update, + }) + + def content_mimetype_missing(self, mimetypes): + return self.post('content_mimetype/missing', {'mimetypes': mimetypes}) + + def content_mimetype_get(self, ids): + return self.post('content_mimetype', {'ids': ids}) + + def content_language_add(self, languages, conflict_update=False): + return self.post('content_language/add', { + 'languages': languages, + 'conflict_update': conflict_update, + }) + + def content_language_missing(self, languages): + return self.post('content_language/missing', {'languages': languages}) + + def content_language_get(self, ids): + return self.post('content_language', {'ids': ids}) + + def content_ctags_add(self, ctags, conflict_update=False): + return self.post('content/ctags/add', { + 'ctags': ctags, + 'conflict_update': conflict_update, + }) + + def content_ctags_missing(self, ctags): + return self.post('content/ctags/missing', {'ctags': ctags}) + + def content_ctags_get(self, ids): + return self.post('content/ctags', {'ids': ids}) + + def content_ctags_search(self, expression, limit=10, last_sha1=None): + return self.post('content/ctags/search', { + 'expression': expression, + 'limit': limit, + 'last_sha1': last_sha1, + }) + + def content_fossology_license_add(self, licenses, conflict_update=False): + return self.post('content/fossology_license/add', { + 'licenses': licenses, + 'conflict_update': conflict_update, + }) + + def content_fossology_license_get(self, ids): + return self.post('content/fossology_license', {'ids': ids}) + + def content_metadata_add(self, metadatas, conflict_update=False): + return self.post('content_metadata/add', { + 'metadatas': metadatas, + 'conflict_update': conflict_update, + }) + + def content_metadata_missing(self, metadatas): + return self.post('content_metadata/missing', {'metadatas': metadatas}) + + def content_metadata_get(self, ids): + return self.post('content_metadata', {'ids': ids}) + + def revision_metadata_add(self, metadatas, conflict_update=False): + return self.post('revision_metadata/add', { + 'metadatas': metadatas, + 'conflict_update': conflict_update, + }) + + def 
revision_metadata_missing(self, metadatas): + return self.post('revision_metadata/missing', {'metadatas': metadatas}) + + def revision_metadata_get(self, ids): + return self.post('revision_metadata', {'ids': ids}) + + def indexer_configuration_add(self, tools): + return self.post('indexer_configuration/add', {'tools': tools}) + + def indexer_configuration_get(self, tool): + return self.post('indexer_configuration/data', {'tool': tool}) diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py new file mode 100644 index 0000000..5ebacf2 --- /dev/null +++ b/swh/indexer/storage/api/server.py @@ -0,0 +1,197 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import logging +import click + +from flask import g, request + +from swh.core import config +from swh.core.api import (SWHServerAPIApp, decode_request, + error_handler, + encode_data_server as encode_data) +from swh.indexer import get_indexer_storage + + +DEFAULT_CONFIG = { + 'storage': ('dict', { + 'cls': 'local', + 'args': { + 'db': 'dbname=softwareheritage-indexer-dev', + }, + }) +} + + +app = SWHServerAPIApp(__name__) + + +@app.errorhandler(Exception) +def my_error_handler(exception): + return error_handler(exception, encode_data) + + +@app.before_request +def before_request(): + g.storage = get_indexer_storage(**app.config['storage']) + + +@app.route('/') +def index(): + return 'SWH Indexer Storage API server' + + +@app.route('/check_config', methods=['POST']) +def check_config(): + return encode_data(g.storage.check_config(**decode_request(request))) + + +@app.route('/content_mimetype/add', methods=['POST']) +def content_mimetype_add(): + return encode_data( + g.storage.content_mimetype_add(**decode_request(request))) + + +@app.route('/content_mimetype/missing', methods=['POST']) +def content_mimetype_missing(): + return encode_data( + g.storage.content_mimetype_missing(**decode_request(request))) + + +@app.route('/content_mimetype', methods=['POST']) +def content_mimetype_get(): + return encode_data( + g.storage.content_mimetype_get(**decode_request(request))) + + +@app.route('/content_language/add', methods=['POST']) +def content_language_add(): + return encode_data( + g.storage.content_language_add(**decode_request(request))) + + +@app.route('/content_language/missing', methods=['POST']) +def content_language_missing(): + return encode_data( + g.storage.content_language_missing(**decode_request(request))) + + +@app.route('/content_language', methods=['POST']) +def content_language_get(): + return encode_data( + g.storage.content_language_get(**decode_request(request))) + + +@app.route('/content/ctags/add', methods=['POST']) +def content_ctags_add(): + return encode_data( + g.storage.content_ctags_add(**decode_request(request))) + + +@app.route('/content/ctags/search', methods=['POST']) +def content_ctags_search(): + return encode_data( + g.storage.content_ctags_search(**decode_request(request))) + + +@app.route('/content/ctags/missing', methods=['POST']) +def content_ctags_missing(): + return encode_data( + g.storage.content_ctags_missing(**decode_request(request))) + + +@app.route('/content/ctags', methods=['POST']) +def content_ctags_get(): + return encode_data( + g.storage.content_ctags_get(**decode_request(request))) + + +@app.route('/content/fossology_license/add', methods=['POST']) +def 
content_fossology_license_add(): + return encode_data( + g.storage.content_fossology_license_add(**decode_request(request))) + + +@app.route('/content/fossology_license', methods=['POST']) +def content_fossology_license_get(): + return encode_data( + g.storage.content_fossology_license_get(**decode_request(request))) + + +@app.route('/indexer_configuration/data', methods=['POST']) +def indexer_configuration_get(): + return encode_data(g.storage.indexer_configuration_get( + **decode_request(request))) + + +@app.route('/indexer_configuration/add', methods=['POST']) +def indexer_configuration_add(): + return encode_data(g.storage.indexer_configuration_add( + **decode_request(request))) + + +@app.route('/content_metadata/add', methods=['POST']) +def content_metadata_add(): + return encode_data( + g.storage.content_metadata_add(**decode_request(request))) + + +@app.route('/content_metadata/missing', methods=['POST']) +def content_metadata_missing(): + return encode_data( + g.storage.content_metadata_missing(**decode_request(request))) + + +@app.route('/content_metadata', methods=['POST']) +def content_metadata_get(): + return encode_data( + g.storage.content_metadata_get(**decode_request(request))) + + +@app.route('/revision_metadata/add', methods=['POST']) +def revision_metadata_add(): + return encode_data( + g.storage.revision_metadata_add(**decode_request(request))) + + +@app.route('/revision_metadata/missing', methods=['POST']) +def revision_metadata_missing(): + return encode_data( + g.storage.revision_metadata_missing(**decode_request(request))) + + +@app.route('/revision_metadata', methods=['POST']) +def revision_metadata_get(): + return encode_data( + g.storage.revision_metadata_get(**decode_request(request))) + + +def run_from_webserver(environ, start_response): + """Run the WSGI app from the webserver, loading the configuration.""" + + config_path = '/etc/softwareheritage/indexer/storage.yml' + + app.config.update(config.read(config_path, DEFAULT_CONFIG)) + + handler = logging.StreamHandler() + app.logger.addHandler(handler) + + return app(environ, start_response) + + +@click.command() +@click.argument('config-path', required=1) +@click.option('--host', default='0.0.0.0', help="Host to run the server") +@click.option('--port', default=5007, type=click.INT, + help="Binding port of the server") +@click.option('--debug/--nodebug', default=True, + help="Indicates if the server should run in debug mode") +def launch(config_path, host, port, debug): + app.config.update(config.read(config_path, DEFAULT_CONFIG)) + app.run(host, port=int(port), debug=bool(debug)) + + +if __name__ == '__main__': + launch() diff --git a/swh/indexer/storage/converters.py b/swh/indexer/storage/converters.py new file mode 100644 index 0000000..db7a295 --- /dev/null +++ b/swh/indexer/storage/converters.py @@ -0,0 +1,140 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def ctags_to_db(ctags): + """Convert a ctags entry into a ready ctags entry. 
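[Editor's aside on the HTTP layer: with the Flask server above running (its launch command binds to port 5007 by default), the RemoteStorage client added earlier in this diff can be pointed at it. A hedged sketch with a hypothetical URL and hypothetical ids; it assumes a deployed indexer storage API.]

from swh.indexer.storage.api.client import RemoteStorage

storage = RemoteStorage('http://localhost:5007/')      # hypothetical local deployment

print(storage.check_config(check_write=False))
missing = storage.content_mimetype_missing(
    [{'id': b'\x56' * 20, 'indexer_configuration_id': 1}])   # hypothetical values
print(list(missing))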
+
+    Args:
+        ctags (dict): ctags entry with the following keys:
+
+            - id (bytes): content's identifier
+            - indexer_configuration_id (int): tool id used to compute
+              the ctags
+            - ctags ([dict]): list of dictionaries with the following keys:
+
+                - name (str): symbol's name
+                - kind (str): symbol's kind
+                - line (int): symbol's line in the content
+                - lang (str): language
+
+    Yields:
+        dict: ctags entries ready for the database, with the following keys:
+
+            - id (bytes): content's identifier
+            - name (str): symbol's name
+            - kind (str): symbol's kind
+            - line (int): symbol's line in the content
+            - lang (str): language for that content
+            - indexer_configuration_id (int): tool id used to compute
+              the ctags
+
+    """
+    id = ctags['id']
+    tool_id = ctags['indexer_configuration_id']
+    for ctag in ctags['ctags']:
+        yield {
+            'id': id,
+            'name': ctag['name'],
+            'kind': ctag['kind'],
+            'line': ctag['line'],
+            'lang': ctag['lang'],
+            'indexer_configuration_id': tool_id,
+        }
+
+
+def db_to_ctags(ctag):
+    """Convert a ctags row from the database into a ready ctags entry.
+
+    Args:
+        ctag (dict): ctags entry as read from the database, with the
+            following keys:
+
+            - id (bytes): content's identifier
+            - name (str): symbol's name
+            - kind (str): symbol's kind
+            - line (int): symbol's line in the content
+            - lang (str): language
+            - tool_id, tool_name, tool_version, tool_configuration:
+              columns describing the tool used to compute the ctags
+
+    Returns:
+        dict: ready ctags entry with the following keys:
+
+            - id (bytes): content's identifier
+            - name (str): symbol's name
+            - kind (str): symbol's kind
+            - line (int): symbol's line in the content
+            - lang (str): language for that content
+            - tool (dict): tool used to compute the ctags
+
+    """
+    return {
+        'id': ctag['id'],
+        'name': ctag['name'],
+        'kind': ctag['kind'],
+        'line': ctag['line'],
+        'lang': ctag['lang'],
+        'tool': {
+            'id': ctag['tool_id'],
+            'name': ctag['tool_name'],
+            'version': ctag['tool_version'],
+            'configuration': ctag['tool_configuration']
+        }
+    }
+
+
+def db_to_mimetype(mimetype):
+    """Convert a mimetype entry into a ready mimetype output.
+
+    """
+    return {
+        'id': mimetype['id'],
+        'encoding': mimetype['encoding'],
+        'mimetype': mimetype['mimetype'],
+        'tool': {
+            'id': mimetype['tool_id'],
+            'name': mimetype['tool_name'],
+            'version': mimetype['tool_version'],
+            'configuration': mimetype['tool_configuration']
+        }
+    }
+
+
+def db_to_language(language):
+    """Convert a language entry into a ready language output.
+
+    """
+    return {
+        'id': language['id'],
+        'lang': language['lang'],
+        'tool': {
+            'id': language['tool_id'],
+            'name': language['tool_name'],
+            'version': language['tool_version'],
+            'configuration': language['tool_configuration']
+        }
+    }
+
+
+def db_to_metadata(metadata):
+    """Convert a metadata entry into a ready metadata output.
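+
+    Args:
+        metadata (dict): database row with the keys id, translated_metadata
+            and the tool_id, tool_name, tool_version, tool_configuration
+            columns
+
+    Returns:
+        dict: the same data with the tool columns grouped under a single
+        'tool' key: {'id': ..., 'translated_metadata': ...,
+        'tool': {'id': ..., 'name': ..., 'version': ..., 'configuration': ...}}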
+ + """ + return { + 'id': metadata['id'], + 'translated_metadata': metadata['translated_metadata'], + 'tool': { + 'id': metadata['tool_id'], + 'name': metadata['tool_name'], + 'version': metadata['tool_version'], + 'configuration': metadata['tool_configuration'] + } + } + + +def db_to_fossology_license(license): + return { + 'id': license['id'], + 'licenses': license['licenses'], + 'tool': { + 'id': license['tool_id'], + 'name': license['tool_name'], + 'version': license['tool_version'], + 'configuration': license['tool_configuration'], + } + } diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py new file mode 100644 index 0000000..b51402e --- /dev/null +++ b/swh/indexer/storage/db.py @@ -0,0 +1,245 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.model import hashutil + +from swh.storage.db import BaseDb, stored_procedure, cursor_to_bytes +from swh.storage.db import line_to_bytes + + +class Db(BaseDb): + """Proxy to the SWH Indexer DB, with wrappers around stored procedures + + """ + @stored_procedure('swh_mktemp_bytea') + def mktemp_bytea(self, cur=None): pass + + def store_tmp_bytea(self, ids, cur=None): + """Store the given identifiers in a new tmp_bytea table""" + cur = self._cursor(cur) + + self.mktemp_bytea(cur) + self.copy_to(({'id': elem} for elem in ids), 'tmp_bytea', + ['id'], cur) + + content_mimetype_cols = [ + 'id', 'mimetype', 'encoding', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_mimetype_missing') + def mktemp_content_mimetype_missing(self, cur=None): pass + + def content_mimetype_missing_from_temp(self, cur=None): + """List missing mimetypes. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_mimetype_missing()") + yield from cursor_to_bytes(cur) + + @stored_procedure('swh_mktemp_content_mimetype') + def mktemp_content_mimetype(self, cur=None): pass + + def content_mimetype_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_mimetype_add(%s)", + (conflict_update, )) + + def content_mimetype_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_mimetype_get()" % ( + ','.join(self.content_mimetype_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_language_cols = [ + 'id', 'lang', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_language') + def mktemp_content_language(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_language_missing') + def mktemp_content_language_missing(self, cur=None): pass + + def content_language_missing_from_temp(self, cur=None): + """List missing languages. 
+ + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_language_missing()") + yield from cursor_to_bytes(cur) + + def content_language_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_language_add(%s)", + (conflict_update, )) + + def content_language_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_language_get()" % ( + ','.join(self.content_language_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_ctags_cols = [ + 'id', 'name', 'kind', 'line', 'lang', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_ctags') + def mktemp_content_ctags(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_ctags_missing') + def mktemp_content_ctags_missing(self, cur=None): pass + + def content_ctags_missing_from_temp(self, cur=None): + """List missing ctags. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_ctags_missing()") + yield from cursor_to_bytes(cur) + + def content_ctags_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_ctags_add(%s)", + (conflict_update, )) + + def content_ctags_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_ctags_get()" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + def content_ctags_search(self, expression, last_sha1, limit, cur=None): + cur = self._cursor(cur) + if not last_sha1: + query = """SELECT %s + FROM swh_content_ctags_search(%%s, %%s)""" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query, (expression, limit)) + else: + if last_sha1 and isinstance(last_sha1, bytes): + last_sha1 = '\\x%s' % hashutil.hash_to_hex(last_sha1) + elif last_sha1: + last_sha1 = '\\x%s' % last_sha1 + + query = """SELECT %s + FROM swh_content_ctags_search(%%s, %%s, %%s)""" % ( + ','.join(self.content_ctags_cols)) + cur.execute(query, (expression, limit, last_sha1)) + + yield from cursor_to_bytes(cur) + + content_fossology_license_cols = [ + 'id', 'tool_id', 'tool_name', 'tool_version', 'tool_configuration', + 'licenses'] + + @stored_procedure('swh_mktemp_content_fossology_license') + def mktemp_content_fossology_license(self, cur=None): pass + + def content_fossology_license_add_from_temp(self, conflict_update, + cur=None): + """Add new licenses per content. + + """ + self._cursor(cur).execute( + "SELECT swh_content_fossology_license_add(%s)", + (conflict_update, )) + + def content_fossology_license_get_from_temp(self, cur=None): + """Retrieve licenses per content. + + """ + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_fossology_license_get()" % ( + ','.join(self.content_fossology_license_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + content_metadata_cols = [ + 'id', 'translated_metadata', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_content_metadata') + def mktemp_content_metadata(self, cur=None): pass + + @stored_procedure('swh_mktemp_content_metadata_missing') + def mktemp_content_metadata_missing(self, cur=None): pass + + def content_metadata_missing_from_temp(self, cur=None): + """List missing metadatas. 
+ + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_content_metadata_missing()") + yield from cursor_to_bytes(cur) + + def content_metadata_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_content_metadata_add(%s)", + (conflict_update, )) + + def content_metadata_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_content_metadata_get()" % ( + ','.join(self.content_metadata_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + revision_metadata_cols = [ + 'id', 'translated_metadata', + 'tool_id', 'tool_name', 'tool_version', 'tool_configuration'] + + @stored_procedure('swh_mktemp_revision_metadata') + def mktemp_revision_metadata(self, cur=None): pass + + @stored_procedure('swh_mktemp_revision_metadata_missing') + def mktemp_revision_metadata_missing(self, cur=None): pass + + def revision_metadata_missing_from_temp(self, cur=None): + """List missing metadatas. + + """ + cur = self._cursor(cur) + cur.execute("SELECT * FROM swh_revision_metadata_missing()") + yield from cursor_to_bytes(cur) + + def revision_metadata_add_from_temp(self, conflict_update, cur=None): + self._cursor(cur).execute("SELECT swh_revision_metadata_add(%s)", + (conflict_update, )) + + def revision_metadata_get_from_temp(self, cur=None): + cur = self._cursor(cur) + query = "SELECT %s FROM swh_revision_metadata_get()" % ( + ','.join(self.revision_metadata_cols)) + cur.execute(query) + yield from cursor_to_bytes(cur) + + indexer_configuration_cols = ['id', 'tool_name', 'tool_version', + 'tool_configuration'] + + @stored_procedure('swh_mktemp_indexer_configuration') + def mktemp_indexer_configuration(self, cur=None): + pass + + def indexer_configuration_add_from_temp(self, cur=None): + cur = self._cursor(cur) + cur.execute("SELECT %s from swh_indexer_configuration_add()" % ( + ','.join(self.indexer_configuration_cols), )) + yield from cursor_to_bytes(cur) + + def indexer_configuration_get(self, tool_name, + tool_version, tool_configuration, cur=None): + cur = self._cursor(cur) + cur.execute('''select %s + from indexer_configuration + where tool_name=%%s and + tool_version=%%s and + tool_configuration=%%s''' % ( + ','.join(self.indexer_configuration_cols)), + (tool_name, tool_version, tool_configuration)) + + data = cur.fetchone() + if not data: + return None + return line_to_bytes(data) diff --git a/swh/indexer/tests/__init__.py b/swh/indexer/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/indexer/tests/storage/test_api_client.py b/swh/indexer/tests/storage/test_api_client.py new file mode 100644 index 0000000..9e47975 --- /dev/null +++ b/swh/indexer/tests/storage/test_api_client.py @@ -0,0 +1,36 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from .test_storage import CommonTestStorage +from swh.storage.tests.server_testing import ServerTestFixture +from swh.indexer.storage.api.client import RemoteStorage +from swh.indexer.storage.api.server import app + + +class TestRemoteStorage(CommonTestStorage, ServerTestFixture, + unittest.TestCase): + """Test the indexer's remote storage API. 
+ + This class doesn't define any tests as we want identical + functionality between local and remote storage. All the tests are + therefore defined in + `class`:swh.indexer.storage.test_storage.CommonTestStorage. + + """ + + def setUp(self): + self.config = { + 'storage': { + 'cls': 'local', + 'args': { + 'db': 'dbname=%s' % self.TEST_STORAGE_DB_NAME, + } + } + } + self.app = app + super().setUp() + self.storage = RemoteStorage(self.url()) diff --git a/swh/indexer/tests/storage/test_converters.py b/swh/indexer/tests/storage/test_converters.py new file mode 100644 index 0000000..89946d4 --- /dev/null +++ b/swh/indexer/tests/storage/test_converters.py @@ -0,0 +1,199 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest + +from nose.tools import istest +from nose.plugins.attrib import attr + +from swh.indexer.storage import converters + + +@attr('!db') +class TestConverters(unittest.TestCase): + def setUp(self): + self.maxDiff = None + + @istest + def ctags_to_db(self): + input_ctag = { + 'id': b'some-id', + 'indexer_configuration_id': 100, + 'ctags': [ + { + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + }, { + 'name': 'main', + 'kind': 'function', + 'line': 12, + 'lang': 'Yaml', + }, + ] + } + + expected_ctags = [ + { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'indexer_configuration_id': 100, + }, { + 'id': b'some-id', + 'name': 'main', + 'kind': 'function', + 'line': 12, + 'lang': 'Yaml', + 'indexer_configuration_id': 100, + }] + + # when + actual_ctags = list(converters.ctags_to_db(input_ctag)) + + # then + self.assertEquals(actual_ctags, expected_ctags) + + @istest + def db_to_ctags(self): + input_ctags = { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'tool_id': 200, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {} + } + expected_ctags = { + 'id': b'some-id', + 'name': 'some-name', + 'kind': 'some-kind', + 'line': 10, + 'lang': 'Yaml', + 'tool': { + 'id': 200, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + # when + actual_ctags = converters.db_to_ctags(input_ctags) + + # then + self.assertEquals(actual_ctags, expected_ctags) + + @istest + def db_to_mimetype(self): + input_mimetype = { + 'id': b'some-id', + 'tool_id': 10, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'encoding': b'ascii', + 'mimetype': b'text/plain', + } + + expected_mimetype = { + 'id': b'some-id', + 'encoding': b'ascii', + 'mimetype': b'text/plain', + 'tool': { + 'id': 10, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_mimetype = converters.db_to_mimetype(input_mimetype) + + self.assertEquals(actual_mimetype, expected_mimetype) + + @istest + def db_to_language(self): + input_language = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'lang': b'css', + } + + expected_language = { + 'id': b'some-id', + 'lang': b'css', + 'tool': { + 'id': 20, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_language = 
converters.db_to_language(input_language) + + self.assertEquals(actual_language, expected_language) + + @istest + def db_to_fossology_license(self): + input_license = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'nomossa', + 'tool_version': '5.22', + 'tool_configuration': {}, + 'licenses': ['GPL2.0'], + } + + expected_license = { + 'id': b'some-id', + 'licenses': ['GPL2.0'], + 'tool': { + 'id': 20, + 'name': 'nomossa', + 'version': '5.22', + 'configuration': {}, + } + } + + actual_license = converters.db_to_fossology_license(input_license) + + self.assertEquals(actual_license, expected_license) + + @istest + def db_to_metadata(self): + input_metadata = { + 'id': b'some-id', + 'tool_id': 20, + 'tool_name': 'some-toolname', + 'tool_version': 'some-toolversion', + 'tool_configuration': {}, + 'translated_metadata': b'translated_metadata', + } + + expected_metadata = { + 'id': b'some-id', + 'translated_metadata': b'translated_metadata', + 'tool': { + 'id': 20, + 'name': 'some-toolname', + 'version': 'some-toolversion', + 'configuration': {}, + } + } + + actual_metadata = converters.db_to_metadata(input_metadata) + + self.assertEquals(actual_metadata, expected_metadata) diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py new file mode 100644 index 0000000..65b77c8 --- /dev/null +++ b/swh/indexer/tests/storage/test_storage.py @@ -0,0 +1,1505 @@ +# Copyright (C) 2015-2017 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pathlib +import unittest + +from nose.tools import istest +from nose.plugins.attrib import attr +from swh.model.hashutil import hash_to_bytes + +from swh.indexer import get_indexer_storage +from swh.core.tests.db_testing import DbTestFixture + + +PATH_TO_STORAGE_TEST_DATA = '../../../../../swh-storage-testdata' + + +class StorageTestFixture: + """Mix this in a test subject class to get Storage testing support. + + This fixture requires to come before DbTestFixture in the inheritance list + as it uses its methods to setup its own internal database. + + Usage example: + + class TestStorage(StorageTestFixture, DbTestFixture): + ... 
+ """ + TEST_STORAGE_DB_NAME = 'softwareheritage-test-indexer' + + @classmethod + def setUpClass(cls): + if not hasattr(cls, 'DB_TEST_FIXTURE_IMPORTED'): + raise RuntimeError("StorageTestFixture needs to be followed by " + "DbTestFixture in the inheritance list.") + + test_dir = pathlib.Path(__file__).absolute().parent + test_data_dir = test_dir / PATH_TO_STORAGE_TEST_DATA + test_db_dump = (test_data_dir / 'dumps/swh-indexer.dump').absolute() + cls.add_db(cls.TEST_STORAGE_DB_NAME, str(test_db_dump), 'pg_dump') + super().setUpClass() + + def setUp(self): + super().setUp() + + self.storage_config = { + 'cls': 'local', + 'args': { + 'db': self.test_db[self.TEST_STORAGE_DB_NAME].conn, + }, + } + self.storage = get_indexer_storage(**self.storage_config) + + def tearDown(self): + super().tearDown() + + def reset_storage_tables(self): + excluded = {'indexer_configuration'} + self.reset_db_tables(self.TEST_STORAGE_DB_NAME, excluded=excluded) + + db = self.test_db[self.TEST_STORAGE_DB_NAME] + db.conn.commit() + + +@attr('db') +class BaseTestStorage(StorageTestFixture, DbTestFixture): + def setUp(self): + super().setUp() + + db = self.test_db[self.TEST_STORAGE_DB_NAME] + self.conn = db.conn + self.cursor = db.cursor + + self.sha1_1 = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689') + self.sha1_2 = hash_to_bytes('61c2b3a30496d329e21af70dd2d7e097046d07b7') + self.revision_id_1 = hash_to_bytes( + '7026b7c1a2af56521e951c01ed20f255fa054238') + self.revision_id_2 = hash_to_bytes( + '7026b7c1a2af56521e9587659012345678904321') + + def tearDown(self): + self.reset_storage_tables() + super().tearDown() + + def fetch_tools(self): + tools = {} + self.cursor.execute(''' + select tool_name, id, tool_version, tool_configuration + from indexer_configuration + order by id''') + for row in self.cursor.fetchall(): + key = row[0] + while key in tools: + key = '_' + key + tools[key] = { + 'id': row[1], + 'name': row[0], + 'version': row[2], + 'configuration': row[3] + } + + return tools + + +@attr('db') +class CommonTestStorage(BaseTestStorage): + """Base class for Indexer Storage testing. 
+ + """ + + @istest + def check_config(self): + self.assertTrue(self.storage.check_config(check_write=True)) + self.assertTrue(self.storage.check_config(check_write=False)) + + @istest + def content_mimetype_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetypes = [ + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }] + + # when + actual_missing = self.storage.content_mimetype_missing(mimetypes) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_1, + self.sha1_2, + ]) + + # given + self.storage.content_mimetype_add([{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + }]) + + # when + actual_missing = self.storage.content_mimetype_missing(mimetypes) + + # then + self.assertEqual(list(actual_missing), [self.sha1_1]) + + @istest + def content_mimetype_add__drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetype_v1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_mimetype_add([mimetype_v1]) + + # when + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + # then + expected_mimetypes_v1 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'], + }] + self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + # given + mimetype_v2 = mimetype_v1.copy() + mimetype_v2.update({ + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + }) + + self.storage.content_mimetype_add([mimetype_v2]) + + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + # mimetype did not change as the v2 was dropped. 
+ self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + @istest + def content_mimetype_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetype_v1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_mimetype_add([mimetype_v1]) + + # when + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + expected_mimetypes_v1 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'], + }] + + # then + self.assertEqual(actual_mimetypes, expected_mimetypes_v1) + + # given + mimetype_v2 = mimetype_v1.copy() + mimetype_v2.update({ + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + }) + + self.storage.content_mimetype_add([mimetype_v2], conflict_update=True) + + actual_mimetypes = list(self.storage.content_mimetype_get( + [self.sha1_2])) + + expected_mimetypes_v2 = [{ + 'id': self.sha1_2, + 'mimetype': b'text/html', + 'encoding': b'us-ascii', + 'tool': { + 'id': 2, + 'name': 'file', + 'version': '5.22', + 'configuration': {'command_line': 'file --mime '} + } + }] + + # mimetype did change as the v2 was used to overwrite v1 + self.assertEqual(actual_mimetypes, expected_mimetypes_v2) + + @istest + def content_mimetype_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['file']['id'] + + mimetypes = [self.sha1_2, self.sha1_1] + + mimetype1 = { + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_mimetype_add([mimetype1]) + + # then + actual_mimetypes = list(self.storage.content_mimetype_get(mimetypes)) + + # then + expected_mimetypes = [{ + 'id': self.sha1_2, + 'mimetype': b'text/plain', + 'encoding': b'utf-8', + 'tool': tools['file'] + }] + + self.assertEqual(actual_mimetypes, expected_mimetypes) + + @istest + def content_language_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + languages = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.content_language_missing(languages)) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1, + ]) + + # given + self.storage.content_language_add([{ + 'id': self.sha1_2, + 'lang': 'haskell', + 'indexer_configuration_id': tool_id, + }]) + + # when + actual_missing = list(self.storage.content_language_missing(languages)) + + # then + self.assertEqual(actual_missing, [self.sha1_1]) + + @istest + def content_language_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language1 = { + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_language_add([language1]) + + # then + actual_languages = list(self.storage.content_language_get( + [self.sha1_2, self.sha1_1])) + + # then + expected_languages = [{ + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'tool': tools['pygments'] + }] + + self.assertEqual(actual_languages, expected_languages) + + @istest + def content_language_add__drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language_v1 = { + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'indexer_configuration_id': tool_id, + } + + # given + 
self.storage.content_language_add([language_v1]) + + # when + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # then + expected_languages_v1 = [{ + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'tool': tools['pygments'] + }] + self.assertEqual(actual_languages, expected_languages_v1) + + # given + language_v2 = language_v1.copy() + language_v2.update({ + 'lang': 'common-lisp', + }) + + self.storage.content_language_add([language_v2]) + + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # language did not change as the v2 was dropped. + self.assertEqual(actual_languages, expected_languages_v1) + + @istest + def content_language_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['pygments']['id'] + + language_v1 = { + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_language_add([language_v1]) + + # when + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # then + expected_languages_v1 = [{ + 'id': self.sha1_2, + 'lang': 'common-lisp', + 'tool': tools['pygments'] + }] + self.assertEqual(actual_languages, expected_languages_v1) + + # given + language_v2 = language_v1.copy() + language_v2.update({ + 'lang': 'emacslisp', + }) + + self.storage.content_language_add([language_v2], conflict_update=True) + + actual_languages = list(self.storage.content_language_get( + [self.sha1_2])) + + # language did not change as the v2 was dropped. + expected_languages_v2 = [{ + 'id': self.sha1_2, + 'lang': 'emacslisp', + 'tool': tools['pygments'] + }] + + # language did change as the v2 was used to overwrite v1 + self.assertEqual(actual_languages, expected_languages_v2) + + @istest + def content_ctags_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['universal-ctags']['id'] + + ctags = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = self.storage.content_ctags_missing(ctags) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1 + ]) + + # given + self.storage.content_ctags_add([ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', + 'line': 119, + 'lang': 'OCaml', + }] + }, + ]) + + # when + actual_missing = self.storage.content_ctags_missing(ctags) + + # then + self.assertEqual(list(actual_missing), [self.sha1_1]) + + @istest + def content_ctags_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['universal-ctags']['id'] + + ctags = [self.sha1_2, self.sha1_1] + + ctag1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Python', + }, + { + 'name': 'main', + 'kind': 'function', + 'line': 119, + 'lang': 'Python', + }] + } + + # when + self.storage.content_ctags_add([ctag1]) + + # then + actual_ctags = list(self.storage.content_ctags_get(ctags)) + + # then + + expected_ctags = [ + { + 'id': self.sha1_2, + 'tool': tools['universal-ctags'], + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Python', + }, + { + 'id': self.sha1_2, + 'tool': tools['universal-ctags'], + 'name': 'main', + 'kind': 'function', + 'line': 119, + 'lang': 'Python', + } + ] + + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_ctags_search(self): + # 1. 
given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag1 = { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + }, + { + 'name': 'counter', + 'kind': 'variable', + 'line': 119, + 'lang': 'Python', + }, + ] + } + + ctag2 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [ + { + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + }, + ] + } + + self.storage.content_ctags_add([ctag1, ctag2]) + + # 1. when + actual_ctags = list(self.storage.content_ctags_search('hello', + limit=1)) + + # 1. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag1['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + } + ]) + + # 2. when + actual_ctags = list(self.storage.content_ctags_search( + 'hello', + limit=1, + last_sha1=ctag1['id'])) + + # 2. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag2['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + } + ]) + + # 3. when + actual_ctags = list(self.storage.content_ctags_search('hello')) + + # 3. then + self.assertEqual(actual_ctags, [ + { + 'id': ctag1['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'function', + 'line': 133, + 'lang': 'Python', + }, + { + 'id': ctag2['id'], + 'tool': tool, + 'name': 'hello', + 'kind': 'variable', + 'line': 100, + 'lang': 'C', + }, + ]) + + # 4. when + actual_ctags = list(self.storage.content_ctags_search('counter')) + + # then + self.assertEqual(actual_ctags, [{ + 'id': ctag1['id'], + 'tool': tool, + 'name': 'counter', + 'kind': 'variable', + 'line': 119, + 'lang': 'Python', + }]) + + @istest + def content_ctags_search_no_result(self): + actual_ctags = list(self.storage.content_ctags_search('counter')) + + self.assertEquals(actual_ctags, []) + + @istest + def content_ctags_add__add_new_ctags_added(self): + # given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag_v1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + }] + } + + # given + self.storage.content_ctags_add([ctag_v1]) + self.storage.content_ctags_add([ctag_v1]) # conflict does nothing + + # when + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # then + expected_ctags = [{ + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }] + + self.assertEqual(actual_ctags, expected_ctags) + + # given + ctag_v2 = ctag_v1.copy() + ctag_v2.update({ + 'ctags': [ + { + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + } + ] + }) + + self.storage.content_ctags_add([ctag_v2]) + + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }, { + 'id': self.sha1_2, + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + 'tool': tool, + } + ] + + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_ctags_add__update_in_place(self): + # given + tools = self.fetch_tools() + tool = tools['universal-ctags'] + tool_id = tool['id'] + + ctag_v1 = { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + 'ctags': [{ + 'name': 'done', + 'kind': 'variable', 
+ 'line': 100, + 'lang': 'Scheme', + }] + } + + # given + self.storage.content_ctags_add([ctag_v1]) + + # when + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # then + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool + } + ] + self.assertEqual(actual_ctags, expected_ctags) + + # given + ctag_v2 = ctag_v1.copy() + ctag_v2.update({ + 'ctags': [ + { + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + }, + { + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + } + ] + }) + + self.storage.content_ctags_add([ctag_v2], conflict_update=True) + + actual_ctags = list(self.storage.content_ctags_get( + [self.sha1_2])) + + # ctag did change as the v2 was used to overwrite v1 + expected_ctags = [ + { + 'id': self.sha1_2, + 'name': 'done', + 'kind': 'variable', + 'line': 100, + 'lang': 'Scheme', + 'tool': tool, + }, + { + 'id': self.sha1_2, + 'name': 'defn', + 'kind': 'function', + 'line': 120, + 'lang': 'Scheme', + 'tool': tool, + } + ] + self.assertEqual(actual_ctags, expected_ctags) + + @istest + def content_fossology_license_get(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license1 = { + 'id': self.sha1_1, + 'licenses': ['GPL-2.0+'], + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_fossology_license_add([license1]) + + # then + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_2, self.sha1_1])) + + expected_license = { + 'id': self.sha1_1, + 'licenses': ['GPL-2.0+'], + 'tool': tool, + } + + # then + self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_fossology_license_add__new_license_added(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license_v1 = { + 'id': self.sha1_1, + 'licenses': ['Apache-2.0'], + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_fossology_license_add([license_v1]) + # conflict does nothing + self.storage.content_fossology_license_add([license_v1]) + + # when + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # then + expected_license = { + 'id': self.sha1_1, + 'licenses': ['Apache-2.0'], + 'tool': tool, + } + self.assertEqual(actual_licenses, [expected_license]) + + # given + license_v2 = license_v1.copy() + license_v2.update({ + 'licenses': ['BSD-2-Clause'], + }) + + self.storage.content_fossology_license_add([license_v2]) + + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + expected_license.update({ + 'licenses': ['Apache-2.0', 'BSD-2-Clause'], + }) + + # license did not change as the v2 was dropped. 
+ self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_fossology_license_add__update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool = tools['nomos'] + tool_id = tool['id'] + + license_v1 = { + 'id': self.sha1_1, + 'licenses': ['CECILL'], + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_fossology_license_add([license_v1]) + # conflict does nothing + self.storage.content_fossology_license_add([license_v1]) + + # when + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # then + expected_license = { + 'id': self.sha1_1, + 'licenses': ['CECILL'], + 'tool': tool, + } + self.assertEqual(actual_licenses, [expected_license]) + + # given + license_v2 = license_v1.copy() + license_v2.update({ + 'licenses': ['CECILL-2.0'] + }) + + self.storage.content_fossology_license_add([license_v2], + conflict_update=True) + + actual_licenses = list(self.storage.content_fossology_license_get( + [self.sha1_1])) + + # license did change as the v2 was used to overwrite v1 + expected_license.update({ + 'licenses': ['CECILL-2.0'] + }) + self.assertEqual(actual_licenses, [expected_license]) + + @istest + def content_metadata_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadatas = [ + { + 'id': self.sha1_2, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.sha1_1, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.content_metadata_missing(metadatas)) + + # then + self.assertEqual(list(actual_missing), [ + self.sha1_2, + self.sha1_1, + ]) + + # given + self.storage.content_metadata_add([{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id + }]) + + # when + actual_missing = list(self.storage.content_metadata_missing(metadatas)) + + # then + self.assertEqual(actual_missing, [self.sha1_1]) + + @istest + def content_metadata_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # when + self.storage.content_metadata_add([metadata1]) + + # then + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2, self.sha1_1])) + + expected_metadatas = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'codeRepository': { + 'type': 'git', + 'url': 'https://github.com/moranegg/metadata_test' + }, + 'description': 'Simple package.json test for indexer', + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas) + + @istest + def content_metadata_add_drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata_v1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # 
given + self.storage.content_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + expected_metadatas_v1 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'other': {}, + 'name': 'test_drop_duplicated_metadata', + 'version': '0.0.1' + }, + }) + + self.storage.content_metadata_add([metadata_v2]) + + # then + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + # metadata did not change as the v2 was dropped. + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + @istest + def content_metadata_add_update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-translator']['id'] + + metadata_v1 = { + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.content_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + # then + expected_metadatas_v1 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'other': {}, + 'name': 'test_update_duplicated_metadata', + 'version': '0.0.1' + }, + }) + self.storage.content_metadata_add([metadata_v2], conflict_update=True) + + actual_metadatas = list(self.storage.content_metadata_get( + [self.sha1_2])) + + # language did not change as the v2 was dropped. 
+ expected_metadatas_v2 = [{ + 'id': self.sha1_2, + 'translated_metadata': { + 'other': {}, + 'name': 'test_update_duplicated_metadata', + 'version': '0.0.1' + }, + 'tool': tools['swh-metadata-translator'] + }] + + # metadata did change as the v2 was used to overwrite v1 + self.assertEqual(actual_metadatas, expected_metadatas_v2) + + @istest + def revision_metadata_missing(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadatas = [ + { + 'id': self.revision_id_1, + 'indexer_configuration_id': tool_id, + }, + { + 'id': self.revision_id_2, + 'indexer_configuration_id': tool_id, + } + ] + + # when + actual_missing = list(self.storage.revision_metadata_missing( + metadatas)) + + # then + self.assertEqual(list(actual_missing), [ + self.revision_id_1, + self.revision_id_2, + ]) + + # given + self.storage.revision_metadata_add([{ + 'id': self.revision_id_1, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id + }]) + + # when + actual_missing = list(self.storage.revision_metadata_missing( + metadatas)) + + # then + self.assertEqual(actual_missing, [self.revision_id_2]) + + @istest + def revision_metadata_get(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_rev = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id + } + + # when + self.storage.revision_metadata_add([metadata_rev]) + + # then + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2, self.revision_id_1])) + + expected_metadatas = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_rev['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas) + + @istest + def revision_metadata_add_drop_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'id': self.revision_id_1, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.revision_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_1])) + + expected_metadatas_v1 = [{ + 'id': self.revision_id_1, + 'translated_metadata': metadata_v1['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + 
metadata_v2.update({ + 'translated_metadata': { + 'name': 'test_metadata', + 'author': 'MG', + }, + }) + + self.storage.revision_metadata_add([metadata_v2]) + + # then + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_1])) + + # metadata did not change as the v2 was dropped. + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + @istest + def revision_metadata_add_update_in_place_duplicate(self): + # given + tools = self.fetch_tools() + tool_id = tools['swh-metadata-detector']['id'] + + metadata_v1 = { + 'id': self.revision_id_2, + 'translated_metadata': { + 'developmentStatus': None, + 'version': None, + 'operatingSystem': None, + 'description': None, + 'keywords': None, + 'issueTracker': None, + 'name': None, + 'author': None, + 'relatedLink': None, + 'url': None, + 'type': None, + 'license': None, + 'maintainer': None, + 'email': None, + 'softwareRequirements': None, + 'identifier': None + }, + 'indexer_configuration_id': tool_id, + } + + # given + self.storage.revision_metadata_add([metadata_v1]) + + # when + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2])) + + # then + expected_metadatas_v1 = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_v1['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + self.assertEqual(actual_metadatas, expected_metadatas_v1) + + # given + metadata_v2 = metadata_v1.copy() + metadata_v2.update({ + 'translated_metadata': { + 'name': 'test_update_duplicated_metadata', + 'author': 'MG' + }, + }) + self.storage.revision_metadata_add([metadata_v2], conflict_update=True) + + actual_metadatas = list(self.storage.revision_metadata_get( + [self.revision_id_2])) + + # language did not change as the v2 was dropped. + expected_metadatas_v2 = [{ + 'id': self.revision_id_2, + 'translated_metadata': metadata_v2['translated_metadata'], + 'tool': tools['swh-metadata-detector'] + }] + + # metadata did change as the v2 was used to overwrite v1 + self.assertEqual(actual_metadatas, expected_metadatas_v2) + + @istest + def indexer_configuration_add(self): + tool = { + 'tool_name': 'some-unknown-tool', + 'tool_version': 'some-version', + 'tool_configuration': {"debian-package": "some-package"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + self.assertIsNone(actual_tool) # does not exist + + # add it + actual_tools = list(self.storage.indexer_configuration_add([tool])) + + self.assertEquals(len(actual_tools), 1) + actual_tool = actual_tools[0] + self.assertIsNotNone(actual_tool) # now it exists + new_id = actual_tool.pop('id') + self.assertEquals(actual_tool, tool) + + actual_tools2 = list(self.storage.indexer_configuration_add([tool])) + actual_tool2 = actual_tools2[0] + self.assertIsNotNone(actual_tool2) # now it exists + new_id2 = actual_tool2.pop('id') + + self.assertEqual(new_id, new_id2) + self.assertEqual(actual_tool, actual_tool2) + + @istest + def indexer_configuration_add_multiple(self): + tool = { + 'tool_name': 'some-unknown-tool', + 'tool_version': 'some-version', + 'tool_configuration': {"debian-package": "some-package"}, + } + + actual_tools = list(self.storage.indexer_configuration_add([tool])) + self.assertEqual(len(actual_tools), 1) + + new_tools = [tool, { + 'tool_name': 'yet-another-tool', + 'tool_version': 'version', + 'tool_configuration': {}, + }] + + actual_tools = list(self.storage.indexer_configuration_add(new_tools)) + self.assertEqual(len(actual_tools), 2) + + # order not guaranteed, so we iterate over results to check + 
for tool in actual_tools: + _id = tool.pop('id') + self.assertIsNotNone(_id) + self.assertIn(tool, new_tools) + + @istest + def indexer_configuration_get_missing(self): + tool = { + 'tool_name': 'unknown-tool', + 'tool_version': '3.1.0rc2-31-ga2cbb8c', + 'tool_configuration': {"command_line": "nomossa "}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + self.assertIsNone(actual_tool) + + @istest + def indexer_configuration_get(self): + tool = { + 'tool_name': 'nomos', + 'tool_version': '3.1.0rc2-31-ga2cbb8c', + 'tool_configuration': {"command_line": "nomossa "}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + expected_tool = tool.copy() + expected_tool['id'] = 1 + + self.assertEqual(expected_tool, actual_tool) + + @istest + def indexer_configuration_metadata_get_missing_context(self): + tool = { + 'tool_name': 'swh-metadata-translator', + 'tool_version': '0.0.1', + 'tool_configuration': {"context": "unknown-context"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + self.assertIsNone(actual_tool) + + @istest + def indexer_configuration_metadata_get(self): + tool = { + 'tool_name': 'swh-metadata-translator', + 'tool_version': '0.0.1', + 'tool_configuration': {"type": "local", "context": "npm"}, + } + + actual_tool = self.storage.indexer_configuration_get(tool) + + expected_tool = tool.copy() + expected_tool['id'] = actual_tool['id'] + + self.assertEqual(expected_tool, actual_tool) + + +class IndexerTestStorage(CommonTestStorage, unittest.TestCase): + """Running the tests locally. + + For the client api tests (remote storage), see + `class`:swh.indexer.storage.test_api_client:TestRemoteStorage + class. + + """ + pass diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py index 96a431d..048f309 100644 --- a/swh/indexer/tests/test_language.py +++ b/swh/indexer/tests/test_language.py @@ -1,113 +1,113 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer import language from swh.indexer.language import ContentLanguageIndexer from swh.indexer.tests.test_utils import MockObjStorage -class MockStorage(): +class _MockIndexerStorage(): """Mock storage to simplify reading indexers' outputs. """ def content_language_add(self, languages, conflict_update=None): self.state = languages self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 20, }] class TestLanguageIndexer(ContentLanguageIndexer): """Specific language whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.config = { 'destination_queue': None, 'rescheduling_task': None, 'tools': { 'name': 'pygments', 'version': '2.0.1+dfsg-1.1+deb8u1', 'configuration': { 'type': 'library', 'debian-package': 'python3-pygments', 'max_content_size': 10240, }, } } - self.storage = MockStorage() + self.idx_storage = _MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tool_config = self.config['tools']['configuration'] self.max_content_size = self.tool_config['max_content_size'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class Language(unittest.TestCase): """ Tests pygments tool for language detection """ def setUp(self): self.maxDiff = None @istest def test_compute_language_none(self): # given self.content = "" self.declared_language = { 'lang': None } # when result = language.compute_language(self.content) # then self.assertEqual(self.declared_language, result) @istest def test_index_content_language_python(self): # given # testing python sha1s = ['02fb2c89e14f7fab46701478c83779c7beb7b069'] lang_indexer = TestLanguageIndexer() # when lang_indexer.run(sha1s, policy_update='ignore-dups') - results = lang_indexer.storage.state + results = lang_indexer.idx_storage.state expected_results = [{ 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', 'indexer_configuration_id': 20, 'lang': 'python' }] # then self.assertEqual(expected_results, results) @istest def test_index_content_language_c(self): # given # testing c sha1s = ['103bc087db1d26afc3a0283f38663d081e9b01e6'] lang_indexer = TestLanguageIndexer() # when lang_indexer.run(sha1s, policy_update='ignore-dups') - results = lang_indexer.storage.state + results = lang_indexer.idx_storage.state expected_results = [{ 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', 'indexer_configuration_id': 20, 'lang': 'c' }] # then self.assertEqual('c', results[0]['lang']) self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 74b8309..2953bfc 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,298 +1,305 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.metadata_dictionary import compute_metadata from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata import ContentMetadataIndexer from swh.indexer.metadata import RevisionMetadataIndexer -from swh.indexer.tests.test_utils import MockObjStorage -from swh.indexer.tests.test_utils import MockStorage +from swh.indexer.tests.test_utils import MockObjStorage, MockStorage +from swh.indexer.tests.test_utils import MockIndexerStorage class TestContentMetadataIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. 
""" def prepare(self): self.config.update({ 'rescheduling_task': None, }) - self.storage = MockStorage() + self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class TestRevisionMetadataIndexer(RevisionMetadataIndexer): """Specific indexer whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'rescheduling_task': None, + 'storage': { + 'cls': 'remote', + 'args': { + 'url': 'http://localhost:9999', + } + }, 'tools': { 'name': 'swh-metadata-detector', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': 'npm' } } } self.storage = MockStorage() + self.idx_storage = MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] self.results = [] class Metadata(unittest.TestCase): """ Tests metadata_mock_tool tool for Metadata detection """ def setUp(self): """ shows the entire diff in the results """ self.maxDiff = None self.content_tool = { 'name': 'swh-metadata-translator', 'version': '0.0.1', 'configuration': { 'type': 'local', 'context': 'npm' } } @istest def test_compute_metadata_none(self): """ testing content empty content is empty should return None """ # given content = b"" context = "npm" # None if no metadata was found or an error occurred declared_metadata = None # when result = compute_metadata(context, content) # then self.assertEqual(declared_metadata, result) @istest def test_compute_metadata_npm(self): """ testing only computation of metadata with hard_mapping_npm """ # given content = b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """ declared_metadata = { 'name': 'test_metadata', 'version': '0.0.1', 'description': 'Simple package.json test for indexer', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'other': {} } # when result = compute_metadata("npm", content) # then self.assertEqual(declared_metadata, result) @istest def test_index_content_metadata_npm(self): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ # given sha1s = ['26a9f72a7c87cc9205725cfd879f514ff4f3d8d5', 'd4c647f0fc257591cc9ba1722484229780d1c607', '02fb2c89e14f7fab46701478c83779c7beb7b069'] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping metadata_indexer = TestContentMetadataIndexer( tool=self.content_tool, config={}) # when metadata_indexer.run(sha1s, policy_update='ignore-dups') - results = metadata_indexer.storage.state + results = metadata_indexer.idx_storage.state expected_results = [{ 'indexer_configuration_id': 30, 'translated_metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'id': '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5' }, { 'indexer_configuration_id': 30, 'translated_metadata': { 
'softwareRequirements': { 'JSONStream': '~1.3.1', 'abbrev': '~1.1.0', 'ansi-regex': '~2.1.1', 'ansicolors': '~0.3.2', 'ansistyles': '~0.1.3' }, 'issueTracker': { 'url': 'https://github.com/npm/npm/issues' }, 'author': 'Isaac Z. Schlueter (http://blog.izs.me)', 'codeRepository': { 'type': 'git', 'url': 'https://github.com/npm/npm' }, 'description': 'a package manager for JavaScript', 'softwareSuggestions': { 'tacks': '~1.2.6', 'tap': '~10.3.2' }, 'license': 'Artistic-2.0', 'version': '5.0.3', 'other': { 'preferGlobal': True, 'config': { 'publishtest': False } }, 'name': 'npm', 'keywords': [ 'install', 'modules', 'package manager', 'package.json' ], 'url': 'https://docs.npmjs.com/' }, 'id': 'd4c647f0fc257591cc9ba1722484229780d1c607' }, { 'indexer_configuration_id': 30, 'translated_metadata': None, 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069' }] # The assertion bellow returns False sometimes because of nested lists self.assertEqual(expected_results, results) @istest def test_detect_metadata_package_json(self): # given df = [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'dir_a', 'sha1': b'cde' }] # when results = detect_metadata(df) expected_results = { 'npm': [ b'cde' ] } # then self.assertEqual(expected_results, results) @istest def test_revision_metadata_indexer(self): metadata_indexer = TestRevisionMetadataIndexer() sha1_gits = [ b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', ] metadata_indexer.run(sha1_gits, 'update-dups') - results = metadata_indexer.storage.state + results = metadata_indexer.idx_storage.state expected_results = [{ 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'translated_metadata': { 'identifier': None, 'maintainer': None, 'url': [ 'https://github.com/librariesio/yarn-parser#readme' ], 'codeRepository': [{ 'type': 'git', 'url': 'git+https://github.com/librariesio/yarn-parser.git' }], 'author': ['Andrew Nesbitt'], 'license': ['AGPL-3.0'], 'version': ['1.0.0'], 'description': [ 'Tiny web service for parsing yarn.lock files' ], 'relatedLink': None, 'developmentStatus': None, 'operatingSystem': None, 'issueTracker': [{ 'url': 'https://github.com/librariesio/yarn-parser/issues' }], 'softwareRequirements': [{ 'express': '^4.14.0', 'yarn': '^0.21.0', 'body-parser': '^1.15.2' }], 'name': ['yarn-parser'], 'keywords': [['yarn', 'parse', 'lock', 'dependencies']], 'type': None, 'email': None }, 'indexer_configuration_id': 7 }] # then self.assertEqual(expected_results, results) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index a15b971..63f6044 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,158 +1,158 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage -class _MockStorage(): +class _MockIndexerStorage(): """Mock storage to simplify reading indexers' outputs. 
""" def content_mimetype_add(self, mimetypes, conflict_update=None): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] class TestMimetypeIndexer(ContentMimetypeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'destination_queue': None, 'rescheduling_task': None, 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } - self.storage = _MockStorage() + self.idx_storage = _MockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.destination_queue = self.config['destination_queue'] self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class TestMimetypeIndexerUnknownToolStorage(TestMimetypeIndexer): """Specific mimetype whose configuration is not enough to satisfy the indexing tests. """ def prepare(self): super().prepare() self.tools = None class TestMimetypeIndexerWithErrors(unittest.TestCase): @istest def wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool should fail the check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): TestMimetypeIndexerUnknownToolStorage() class TestMimetypeIndexerTest(unittest.TestCase): def setUp(self): self.indexer = TestMimetypeIndexer() @istest def test_index_no_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', ] # when self.indexer.run(sha1s, policy_update='ignore-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] - self.assertFalse(self.indexer.storage.conflict_update) - self.assertEquals(expected_results, self.indexer.storage.state) + self.assertFalse(self.indexer.idx_storage.conflict_update) + self.assertEquals(expected_results, self.indexer.idx_storage.state) @istest def test_index_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content ] # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'indexer_configuration_id': 10, 'mimetype': b'application/x-empty', 'encoding': b'binary', }] - self.assertTrue(self.indexer.storage.conflict_update) - self.assertEquals(expected_results, self.indexer.storage.state) + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEquals(expected_results, self.indexer.idx_storage.state) @istest def test_index_one_unknown_sha1(self): # given sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15', '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # 
then expected_results = [{ 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] - self.assertTrue(self.indexer.storage.conflict_update) - self.assertEquals(expected_results, self.indexer.storage.state) + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEquals(expected_results, self.indexer.idx_storage.state) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 3626af8..41c9068 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,253 +1,261 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + from swh.objstorage.exc import ObjNotFoundError class MockObjStorage: - """Mock objstorage with predefined contents. + """Mock an swh-objstorage objstorage with predefined contents. """ data = {} def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', } def __iter__(self): yield from self.data.keys() def __contains__(self, sha1): return self.data.get(sha1) is not None def get(self, sha1): raw_content = self.data.get(sha1) if raw_content is None: raise ObjNotFoundError(sha1) return raw_content -class MockStorage(): - """Mock storage to simplify reading indexers' outputs. 
- """ - def content_metadata_missing(self, sha1s): - yield from [] - - def content_metadata_add(self, metadata, conflict_update=None): - self.state = metadata - self.conflict_update = conflict_update - - def revision_metadata_add(self, metadata, conflict_update=None): - self.state = metadata - self.conflict_update = conflict_update +class MockIndexerStorage(): + """Mock an swh-indexer storage. + """ def indexer_configuration_add(self, tools): tool = tools[0] if tool['tool_name'] == 'swh-metadata-translator': return [{ 'id': 30, 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] elif tool['tool_name'] == 'swh-metadata-detector': return [{ 'id': 7, 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'npm' }, }] + def content_metadata_missing(self, sha1s): + yield from [] + + def content_metadata_add(self, metadata, conflict_update=None): + self.state = metadata + self.conflict_update = conflict_update + + def revision_metadata_add(self, metadata, conflict_update=None): + self.state = metadata + self.conflict_update = conflict_update + + def content_metadata_get(self, sha1s): + return [{ + 'tool': { + 'configuration': { + 'type': 'local', + 'context': 'npm' + }, + 'version': '0.0.1', + 'id': 6, + 'name': 'swh-metadata-translator' + }, + 'id': b'cde', + 'translated_metadata': { + 'issueTracker': { + 'url': 'https://github.com/librariesio/yarn-parser/issues' + }, + 'version': '1.0.0', + 'name': 'yarn-parser', + 'author': 'Andrew Nesbitt', + 'url': 'https://github.com/librariesio/yarn-parser#readme', + 'processorRequirements': {'node': '7.5'}, + 'other': { + 'scripts': { + 'start': 'node index.js' + }, + 'main': 'index.js' + }, + 'license': 'AGPL-3.0', + 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'codeRepository': { + 'type': 'git', + 'url': 'git+https://github.com/librariesio/yarn-parser.git' + }, + 'description': 'Tiny web service for parsing yarn.lock files', + 'softwareRequirements': { + 'yarn': '^0.21.0', + 'express': '^4.14.0', + 'body-parser': '^1.15.2'} + } + }] + + +class MockStorage(): + """Mock a real swh-storage storage to simplify reading indexers' + outputs. 
+ + """ def revision_get(self, revisions): return [{ 'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'committer': { 'id': 26, 'name': b'Andrew Nesbitt', 'fullname': b'Andrew Nesbitt ', 'email': b'andrewnez@gmail.com' }, 'synthetic': False, 'date': { 'negative_utc': False, 'timestamp': { 'seconds': 1487596456, 'microseconds': 0 }, 'offset': 0 }, 'directory': b'10' }] def directory_ls(self, directory, recursive=False, cur=None): # with directory: b'\x9d', return [{ 'sha1_git': b'abc', 'name': b'index.js', 'target': b'abc', 'length': 897, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'bcd' }, { 'sha1_git': b'aab', 'name': b'package.json', 'target': b'aab', 'length': 712, 'status': 'visible', 'type': 'file', 'perms': 33188, 'dir_id': b'10', 'sha1': b'cde' }, { 'dir_id': b'10', 'target': b'11', 'type': 'dir', 'length': None, 'name': b'.github', 'sha1': None, 'perms': 16384, 'sha1_git': None, 'status': None, 'sha256': None }] - - def content_metadata_get(self, sha1s): - return [{ - 'tool': { - 'configuration': { - 'type': 'local', - 'context': 'npm' - }, - 'version': '0.0.1', - 'id': 6, - 'name': 'swh-metadata-translator' - }, - 'id': b'cde', - 'translated_metadata': { - 'issueTracker': { - 'url': 'https://github.com/librariesio/yarn-parser/issues' - }, - 'version': '1.0.0', - 'name': 'yarn-parser', - 'author': 'Andrew Nesbitt', - 'url': 'https://github.com/librariesio/yarn-parser#readme', - 'processorRequirements': {'node': '7.5'}, - 'other': { - 'scripts': { - 'start': 'node index.js' - }, - 'main': 'index.js' - }, - 'license': 'AGPL-3.0', - 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], - 'codeRepository': { - 'type': 'git', - 'url': 'git+https://github.com/librariesio/yarn-parser.git' - }, - 'description': 'Tiny web service for parsing yarn.lock files', - 'softwareRequirements': { - 'yarn': '^0.21.0', - 'express': '^4.14.0', - 'body-parser': '^1.15.2'} - } - }] diff --git a/version.txt b/version.txt index a1e375b..61c6c3c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.43-0-g3e4641b \ No newline at end of file +v0.0.44-0-g30a35bf \ No newline at end of file