Review notes (anything before the first "diff --git" line is ignored by
git-apply and patch):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy; verify context and "-" lines against the target tree before applying.
- Dockerfile: dropped the whitespace-only hunk that added a stray blank line.
- docker-compose.graph.yml: quoted the port mapping, removed trailing blank
  lines, documented the dependency on the kafka service.
- entrypoint.sh: quoted all variable expansions and guarded the "rm -rf".

diff --git a/docker/Dockerfile b/docker/Dockerfile
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,13 +15,18 @@
     libpq-dev \
     libsvn-dev \
     libsystemd-dev \
+    gcc \
     memcached \
+    openjdk-11-jre \
+    pkg-config \
+    pv \
     postgresql-client-12 \
     wait-for-it \
     ngrep \
     rsync \
     nodejs \
-    yarn && \
+    yarn \
+    zstd && \
   apt-get install -y --no-install-recommends \
     opam \
     r-base-core \
diff --git a/docker/conf/graph.yml b/docker/conf/graph.yml
new file mode 100644
--- /dev/null
+++ b/docker/conf/graph.yml
@@ -0,0 +1,9 @@
+journal:
+  brokers:
+    - kafka
+  # Same prefix as the swh.journal.objects.* topics in KAFKA_CREATE_TOPICS
+  prefix: swh.journal.objects
+
+graph:
+  # Lives under the --outdir written by services/swh-graph/entrypoint.sh
+  path: /srv/softwareheritage/graph/compressed/graph
diff --git a/docker/docker-compose.graph.yml b/docker/docker-compose.graph.yml
new file mode 100644
--- /dev/null
+++ b/docker/docker-compose.graph.yml
@@ -0,0 +1,24 @@
+# Compose file adding the swh-graph service. It references a "kafka" service
+# that is not defined here, so it is presumably meant to be layered on top of
+# docker-compose.yml (-f docker-compose.yml -f docker-compose.graph.yml).
+version: "2.1"
+
+services:
+  swh-graph:
+    image: swh/stack
+    build: ./
+    entrypoint: /entrypoint.sh
+    ports:
+      - "5009:5009"
+    environment:
+      SWH_CONFIG_FILENAME: /graph.yml
+    depends_on:
+      kafka:
+        condition: service_healthy
+    env_file:
+      - ./env/common_python.env
+    volumes:
+      - "./conf/graph.yml:/graph.yml:ro"
+      - "./services/swh-graph/entrypoint.sh:/entrypoint.sh:ro"
+      - "../swh-graph:/src/swh-graph"
+      - "../swh-dataset:/src/swh-dataset"
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -20,13 +20,17 @@
       - "9092:9092"
     env_file: ./env/kafka.env
     environment:
-      KAFKA_CREATE_TOPICS: swh.journal.objects.content:1:1,
-                           swh.journal.objects.origin:1:1,
-                           swh.journal.objects.origin_visit:1:1,
-                           swh.journal.objects.origin_visit_status:1:1,
-                           swh.journal.objects.revision:1:1,
-                           swh.journal.objects.release:1:1,
-                           swh.journal.indexed.origin_intrinsic_metadata:1:1
+      KAFKA_BROKER_ID: 1
+      KAFKA_CREATE_TOPICS: swh.journal.objects.content:16:1,
+                           swh.journal.objects.directory:16:1,
+                           swh.journal.objects.origin:16:1,
+                           swh.journal.objects.origin_visit:16:1,
+                           swh.journal.objects.origin_visit_status:16:1,
+                           swh.journal.objects.revision:16:1,
+                           swh.journal.objects.release:16:1,
+                           swh.journal.objects.skipped_content:16:1,
+                           swh.journal.objects.snapshot:16:1,
+                           swh.journal.indexed.origin_intrinsic_metadata:16:1
     depends_on:
       - zookeeper
     healthcheck:
diff --git a/docker/services/swh-graph/entrypoint.sh b/docker/services/swh-graph/entrypoint.sh
new file mode 100755
--- /dev/null
+++ b/docker/services/swh-graph/entrypoint.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# Entrypoint of the swh-graph container.
+#   shell  -> interactive bash
+#   *      -> rebuild the graph (export, sort, compress), then serve it
+
+set -e
+
+source /srv/softwareheritage/utils/pyutils.sh
+setup_pip
+
+DATADIR=/srv/softwareheritage/graph
+
+case "$1" in
+    "shell")
+      exec bash -i
+      ;;
+    *)
+      mkdir -p "$DATADIR/"
+      # Clean up results from previous runs. ${DATADIR:?} makes the shell abort
+      # rather than expand to "/*" should DATADIR ever be unset or empty.
+      rm -rf "${DATADIR:?}"/*
+      mkdir "$DATADIR/g/"
+      echo "Exporting edges and nodes"
+      swh dataset -C "$SWH_CONFIG_FILENAME" graph export "$DATADIR/g" --processes=4
+      echo "Sorting edges and nodes"
+      swh dataset graph sort "$DATADIR/g/edges"
+      echo "Compressing graph"
+      swh graph compress --graph "$DATADIR/g/edges/graph" --outdir "$DATADIR/compressed"
+      echo "Starting the swh-graph API server"
+      exec gunicorn --bind 0.0.0.0:5009 \
+           --worker-class aiohttp.worker.GunicornWebWorker \
+           --reload \
+           --threads 4 \
+           --workers 2 \
+           --log-level DEBUG \
+           --timeout 3600 \
+           --config 'python:swh.core.api.gunicorn_config' \
+           'swh.graph.server.app:make_app_from_configfile()'
+      ;;
+esac