diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..aec7fad
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+.tox
+.eggs
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..07dfbdd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+FROM python:3.9
+
+COPY requirements* /
+COPY docker/pyutils.sh /usr/local/bin/
+
+# System packages plus the unprivileged "swh" account (uid/gid 1000, home
+# /src). The apt lists are removed in the same layer to keep the image small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libcmph-dev rsync && \
+    rm -rf /var/lib/apt/lists/* && \
+    addgroup --gid 1000 swh && \
+    useradd --gid 1000 --uid 1000 -m -d /src swh && \
+    chmod a+x /usr/local/bin/pyutils.sh
+
+USER swh
+
+# Create the virtualenv as "swh" and install every requirements file into it.
+# A plain loop replaces "ls | xargs" so no failure can hide behind a pipe.
+RUN python -m venv /src/venv && \
+    . /src/venv/bin/activate && \
+    python -m pip install --no-cache-dir --upgrade pip && \
+    for req in /requirements*; do \
+        pip install --no-cache-dir -r "$req" || exit 1; \
+    done
+
+ENV SWH_CONFIG_FILENAME=/config.yml
+
+# /entrypoint.sh is bind-mounted per service by docker-compose.yml. Exec form
+# starts it directly instead of wrapping it in "/bin/sh -c".
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/README.md b/README.md
index 47337c8..d8e6e8a 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,49 @@
 swh-provenance
 ==============
 
 Provenance DB module to query the provenance of source code artifacts present
 in the Software Heritage archive.
 
 This project allows to build such a provenance db from the Software Heritage
 Archive, and query this database.
 
 ## Building a provenance database
 
 Building the provenance database requires a read access to the Software
 Heritage archive, either via a direct access to the database (preferred for
 better performances), or using the RPC API to a Software Heritage Storage
 instance.
 
-It also need a postgresql database in which the provenance db will be written
+It also needs a PostgreSQL database in which the provenance db will be written
 into.
 
-A configuration file is needed with with the access to both these databases:
+A configuration file is needed with the access to both these databases:
 
 ```
 archive:
   cls: api
   storage:
     cls: remote
     url: http://uffizi.internal.softwareheritage.org:5002
 
 provenance:
   cls: direct
   db:
     dbname: provenance
     host: localhost
 
 
 ```
+
+## Running in Docker
+
+### Build the image
+```
+docker build -t swh-provenance .
+```
+
+### Run the services
+```
+docker-compose up -d
+docker-compose logs -f
+```
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..ce6009d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,58 @@
+version: '3'
+
+volumes:
+  storage-db:
+
+services:
+  storage-db:
+    image: postgres:13
+    command: [ "-c", "max_connections=1000"]
+    environment:
+      POSTGRES_PASSWORD: provenancepassword
+      POSTGRES_DB: provenance
+      POSTGRES_USER: provenance
+    volumes:
+      # the postgres image keeps its cluster in /var/lib/postgresql/data
+      - storage-db:/var/lib/postgresql/data
+      - ./swh/provenance/sql:/docker-entrypoint-initdb.d
+
+  rabbitmq:
+    image: rabbitmq:3.6-management
+    ports:
+      - "5072:5672"
+
+  storage:
+    image: swh-provenance
+    build: .
+    volumes:
+      - $PWD:/src/swh-provenance
+      - ./docker/storage/entrypoint.sh:/entrypoint.sh
+      - ./docker/storage/config.yml:/config.yml
+    depends_on:
+      - storage-db
+      - rabbitmq
+
+  origin_server:
+    image: swh-provenance
+    build: .
+    volumes:
+      - $PWD:/src/swh-provenance
+      - ./docker/origin_server/entrypoint.sh:/entrypoint.sh
+      - ./docker/origin_server/config.yml:/config.yml
+      - ./docker/data/origins.csv:/origins.csv
+    ports:
+      - "5555:5555"
+
+  origin_client:
+    image: swh-provenance
+    build: .
+    volumes:
+      - $PWD:/src/swh-provenance
+      - ./docker/origin_client/entrypoint.sh:/entrypoint.sh
+      - ./docker/origin_client/config.yml:/config.yml
+    environment:
+      NB_CLIENTS: 3
+    # the client configuration points at both of these services
+    depends_on:
+      - rabbitmq
+      - origin_server
diff --git a/docker/origin_client/config.yml b/docker/origin_client/config.yml
new file mode 100644
index 0000000..7caf996
--- /dev/null
+++ b/docker/origin_client/config.yml
@@ -0,0 +1,36 @@
+provenance:
+  storage:
+    cls: rabbitmq # client configuration
+    url: amqp://rabbitmq:5672/%2f
+    storage_config:
+      cls: postgresql
+      db:
+        host: storage-db
+        dbname: provenance
+        user: provenance
+        password: provenancepassword
+    batch_size: 10000
+    prefetch_count: 100
+    wait_min: 60
+    wait_per_batch: 60
+
+
+  archive:
+    #cls: graph
+    #url: http://graph.internal.softwareheritage.org:5009/graph
+    # cls: direct
+    # db:
+    #   host: swh-storage-db
+    #   port: 5432
+    #   dbname: swh
+    #   user: guest
+    cls: api
+    storage:
+      cls: remote
+      # url: http://webapp.internal.staging.swh.network:5002/
+      url: http://webapp1.internal.softwareheritage.org:5002
+
+  org_server: # origin provider
+    host: origin_server
+    port: 5555
+    batch_size: 1
diff --git a/docker/origin_client/entrypoint.sh b/docker/origin_client/entrypoint.sh
new file mode 100755
index 0000000..3d171a3
--- /dev/null
+++ b/docker/origin_client/entrypoint.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+. /usr/local/bin/pyutils.sh
+
+. /src/venv/bin/activate
+
+setup_pip
+
+# stop here when the source tree is not mounted, as the other entrypoints do
+cd /src/swh-provenance || exit
+python swh/provenance/tools/origins/client.py ${NB_CLIENTS}
diff --git a/docker/origin_server/config.yml b/docker/origin_server/config.yml
new file mode 100644
index 0000000..bd25baf
--- /dev/null
+++ b/docker/origin_server/config.yml
@@ -0,0 +1,5 @@
+provenance:
+  org_server: # origin provider
+    host: localhost
+    port: 5555
+    batch_size: 1
diff --git a/docker/origin_server/entrypoint.sh b/docker/origin_server/entrypoint.sh
new file mode 100755
index 0000000..75a2aa4
--- /dev/null
+++ b/docker/origin_server/entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+. /usr/local/bin/pyutils.sh
+
+. /src/venv/bin/activate
+
+setup_pip
+
+cd /src/swh-provenance || exit
+python swh/provenance/tools/origins/server.py /origins.csv
diff --git a/docker/pyutils.sh b/docker/pyutils.sh
new file mode 100755
index 0000000..bb3237b
--- /dev/null
+++ b/docker/pyutils.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Helpers sourced by the container entrypoints.
+
+# setup_pip: install every /src/swh-* source tree with the pip found on PATH,
+# then list the installed packages.
+setup_pip () {
+  echo "Using pip from $(command -v pip)"
+
+  if [[ -d /src ]] ; then
+    pushd /src || return
+    tmpdir=$(mktemp -d)
+    for srcrepo in swh-* ; do
+      # skip the literal pattern left behind when nothing matches swh-*
+      [[ -e $srcrepo ]] || continue
+      if [[ -w $srcrepo ]]
+      then
+        # Install package in editable mode if source directory is writable
+        pip install -e "$srcrepo"
+      else
+        # Source directories might not be writeable, but building them writes
+        # in-tree; so we're copying them to a location guaranteed to be writeable.
+        rsync -a --chmod=+w "$srcrepo" "$tmpdir/" --exclude "*/__pycache__/" --exclude "*/.tox/" --exclude "*/.hypothesis/"
+        pip install "$tmpdir/$srcrepo"
+      fi
+    done
+    rm -rf "$tmpdir"
+    popd || return
+  fi
+
+  echo Installed Python packages:
+  pip list
+}
diff --git a/docker/storage/config.yml b/docker/storage/config.yml
new file mode 100644
index 0000000..8ed06fd
--- /dev/null
+++ b/docker/storage/config.yml
@@ -0,0 +1,12 @@
+provenance:
+  rabbitmq: # remote storage server configuration
+    url: amqp://rabbitmq:5672/%2f
+    storage_config:
+      cls: postgresql
+      db:
+        host: storage-db
+        dbname: provenance
+        user: provenance
+        password: provenancepassword
+    batch_size: 10000
+    prefetch_count: 100
diff --git a/docker/storage/entrypoint.sh b/docker/storage/entrypoint.sh
new file mode 100755
index 0000000..0a5f60c
--- /dev/null
+++ b/docker/storage/entrypoint.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. /usr/local/bin/pyutils.sh
+
+. /src/venv/bin/activate
+
+setup_pip
+
+cd /src/swh-provenance || exit
+
+# Quoted delimiter: the Python source below is written out verbatim.
+cat <<'EOF' > /tmp/start_storage.py
+import signal
+
+import swh.provenance.api.server
+
+# the following method uses the environment variable SWH_CONFIG_FILENAME
+# to retrieve the configuration file
+server = swh.provenance.api.server.make_server_from_configfile()
+
+server.start()
+interactive = True
+while True:
+    try:
+        if interactive:
+            command = input("Enter EXIT to stop service: ")
+            if command.lower() == "exit":
+                break
+        else:
+            signal.pause()
+    except KeyboardInterrupt:
+        pass
+    except EOFError:
+        # stdin is at end-of-file, so EXIT can never be typed (presumably the
+        # container runs detached, e.g. "docker-compose up -d"): keep serving
+        # and sleep until a signal arrives instead of dying on the exception.
+        interactive = False
+server.stop()
+EOF
+
+python /tmp/start_storage.py