# Patch overview: adds an swh-graph service to the docker stack.
#   - docker/Dockerfile: adds gcc, openjdk-11-jre, pkg-config, pv and zstd to
#     the first apt-get install list.
#   - docker/conf/graph.yml (new file): journal brokers/prefix and the path of
#     the compressed graph.
#   - docker/docker-compose.graph.yml (new file): swh-graph service publishing
#     port 5009, started once kafka reports healthy.
#   - docker/docker-compose.yml: sets KAFKA_BROKER_ID, changes the suffix of
#     every KAFKA_CREATE_TOPICS entry from :1:1 to :16:1 (presumably
#     partitions:replicas; confirm against the kafka image documentation) and
#     adds the directory, skipped_content and snapshot topics.
#   - docker/services/swh-graph/entrypoint.sh (new file): container entrypoint.
# NOTE(review): the kafka healthcheck still only requires `-ge 6` listed
# topics although KAFKA_CREATE_TOPICS now names 10; confirm this threshold is
# still the intended one.
diff --git a/docker/Dockerfile b/docker/Dockerfile index 918b89c..5293ef4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,59 +1,65 @@ FROM python:3.7 RUN . /etc/os-release && echo "deb http://apt.postgresql.org/pub/repos/apt ${VERSION_CODENAME}-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ echo "deb [signed-by=/usr/share/keyrings/nodejs-archive-keyring.gpg] https://deb.nodesource.com/node_12.x ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/nodejs.list && \ curl -fsSL https://deb.nodesource.com/gpgkey/nodesource.gpg.key | gpg --dearmor > /usr/share/keyrings/nodejs-archive-keyring.gpg && \ echo "deb [signed-by=/usr/share/keyrings/yarnpkg-archive-keyring.gpg] https://dl.yarnpkg.com/debian/ stable main" > /etc/apt/sources.list.d/yarnpkg.list && \ curl -fsSL https://dl.yarnpkg.com/debian/pubkey.gpg | gpg --dearmor > /usr/share/keyrings/yarnpkg-archive-keyring.gpg RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && apt-get upgrade -y && \ apt-get install -y \ libapr1-dev \ libaprutil1-dev \ libpq-dev \ libsvn-dev \ libsystemd-dev \ + gcc \ memcached \ + openjdk-11-jre \ + pkg-config \ + pv \ postgresql-client-12 \ wait-for-it \ ngrep \ rsync \ nodejs \ - yarn && \ + yarn \ + zstd && \ apt-get install -y --no-install-recommends \ opam \ r-base-core \ r-cran-jsonlite && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* + RUN useradd -md /srv/softwareheritage -s /bin/bash swh USER swh RUN python3 -m venv /srv/softwareheritage/venv ENV PATH="/srv/softwareheritage/venv/bin:${PATH}" RUN pip install --upgrade pip setuptools wheel RUN pip install gunicorn httpie RUN pip install \ swh-core[db,http] \ swh-counters \ swh-deposit[server] \ swh-indexer \ swh-journal \ swh-lister \ swh-loader-core \ swh-loader-git \ swh-loader-mercurial \ swh-loader-svn \ swh-storage \ swh-objstorage \ swh-scheduler \ swh-vault \ swh-web COPY utils/*.sh 
/srv/softwareheritage/utils/ RUN mkdir -p /srv/softwareheritage/objects RUN rm -rd /srv/softwareheritage/.cache diff --git a/docker/conf/graph.yml b/docker/conf/graph.yml new file mode 100644 index 0000000..1125aa4 --- /dev/null +++ b/docker/conf/graph.yml @@ -0,0 +1,7 @@ +journal: + brokers: + - kafka + prefix: swh.journal.objects + +graph: + path: /srv/softwareheritage/graph/compressed/graph diff --git a/docker/docker-compose.graph.yml b/docker/docker-compose.graph.yml new file mode 100644 index 0000000..2e58630 --- /dev/null +++ b/docker/docker-compose.graph.yml @@ -0,0 +1,23 @@ +version: "2.1" + +services: + swh-graph: + image: swh/stack + build: ./ + entrypoint: /entrypoint.sh + ports: + - 5009:5009 + environment: + SWH_CONFIG_FILENAME: /graph.yml + depends_on: + kafka: + condition: service_healthy + env_file: + - ./env/common_python.env + volumes: + - "./conf/graph.yml:/graph.yml:ro" + - "./services/swh-graph/entrypoint.sh:/entrypoint.sh:ro" + - "../swh-graph:/src/swh-graph" + - "../swh-dataset:/src/swh-dataset" + + diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index d768149..4b659d0 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,471 +1,475 @@ version: "2.1" services: amqp: image: rabbitmq:3.6-management ports: - 5072:5672 zookeeper: image: wurstmeister/zookeeper restart: always ports: - "5081:2181" environment: ZOO_LOG4J_PROP: WARN,CONSOLE kafka: image: wurstmeister/kafka ports: - "9092:9092" env_file: ./env/kafka.env environment: - KAFKA_CREATE_TOPICS: swh.journal.objects.content:1:1, - swh.journal.objects.origin:1:1, - swh.journal.objects.origin_visit:1:1, - swh.journal.objects.origin_visit_status:1:1, - swh.journal.objects.revision:1:1, - swh.journal.objects.release:1:1, - swh.journal.indexed.origin_intrinsic_metadata:1:1 + KAFKA_BROKER_ID: 1 + KAFKA_CREATE_TOPICS: swh.journal.objects.content:16:1, + swh.journal.objects.directory:16:1, + swh.journal.objects.origin:16:1, + 
swh.journal.objects.origin_visit:16:1, + swh.journal.objects.origin_visit_status:16:1, + swh.journal.objects.revision:16:1, + swh.journal.objects.release:16:1, + swh.journal.objects.skipped_content:16:1, + swh.journal.objects.snapshot:16:1, + swh.journal.indexed.origin_intrinsic_metadata:16:1 depends_on: - zookeeper healthcheck: test: "[ `JMX_PORT= kafka-topics.sh --list --zookeeper zookeeper:2181 | wc -l` -ge 6 ]" interval: 10s timeout: 5s retries: 10 cmak: # Note: CMAK does not work out of the box, you need to run this first: # $ docker-compose exec zookeeper ./bin/zkCli.sh # create /kafka-manager/mutex "" # create /kafka-manager/mutex/locks "" # create /kafka-manager/mutex/leases "" # See: https://github.com/yahoo/CMAK/issues/731#issuecomment-643880544 image: hlebalbau/kafka-manager:stable environment: ZK_HOSTS: "zookeeper:2181" prometheus: image: prom/prometheus depends_on: - prometheus-statsd-exporter command: # Needed for the reverse-proxy - "--web.external-url=/prometheus" - "--config.file=/etc/prometheus/prometheus.yml" volumes: - "./conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro" restart: unless-stopped prometheus-statsd-exporter: image: prom/statsd-exporter command: - "--statsd.mapping-config=/etc/prometheus/statsd-mapping.yml" volumes: - "./conf/prometheus-statsd-mapping.yml:/etc/prometheus/statsd-mapping.yml:ro" restart: unless-stopped prometheus-rabbitmq-exporter: image: kbudde/rabbitmq-exporter restart: unless-stopped environment: SKIP_QUEUES: "RPC_.*" MAX_QUEUES: 5000 RABBIT_URL: http://amqp:15672 LOG_LEVEL: warning grafana: image: grafana/grafana restart: unless-stopped depends_on: - prometheus environment: GF_SERVER_ROOT_URL: http://localhost:5080/grafana volumes: - "./conf/grafana/provisioning:/etc/grafana/provisioning:ro" - "./conf/grafana/dashboards:/var/lib/grafana/dashboards" nginx: image: nginx volumes: - "./conf/nginx.conf:/etc/nginx/nginx.conf:ro" ports: - 5080:5080 # Scheduler swh-scheduler-db: image: postgres:12 env_file: - 
./env/common_python.env - ./env/scheduler-db.env swh-scheduler: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/scheduler-db.env - ./env/scheduler.env environment: SWH_CONFIG_FILENAME: /scheduler.yml SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml entrypoint: /entrypoint.sh depends_on: - swh-scheduler-db ports: - 5008:5008 volumes: - "./conf/scheduler.yml:/scheduler.yml:ro" - "./services/swh-scheduler/entrypoint.sh:/entrypoint.sh:ro" swh-scheduler-listener: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/scheduler-db.env - ./env/scheduler.env environment: SWH_CONFIG_FILENAME: /scheduler.yml SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml entrypoint: /entrypoint.sh command: start-listener depends_on: - swh-scheduler - amqp volumes: - "./conf/scheduler.yml:/scheduler.yml:ro" - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" swh-scheduler-runner: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/scheduler-db.env - ./env/scheduler.env environment: SWH_CONFIG_FILENAME: /scheduler.yml SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml entrypoint: /entrypoint.sh command: start-runner --period 10 depends_on: - swh-scheduler - amqp volumes: - "./conf/scheduler.yml:/scheduler.yml:ro" - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" swh-scheduler-runner-priority: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/scheduler-db.env - ./env/scheduler.env environment: SWH_CONFIG_FILENAME: /scheduler.yml SWH_SCHEDULER_CONFIG_FILE: /scheduler.yml entrypoint: /entrypoint.sh command: start-runner --period 10 --with-priority --task-type load-git --task-type load-svn --task-type load-hg --task-type load-archive-files depends_on: - swh-scheduler - amqp volumes: - "./conf/scheduler.yml:/scheduler.yml:ro" - "./services/swh-scheduler-worker/entrypoint.sh:/entrypoint.sh:ro" # Graph storage swh-storage-db: image: postgres:12 env_file: - ./env/storage-db.env swh-storage: image: swh/stack 
build: ./ ports: - 5002:5002 depends_on: - swh-storage-db - swh-objstorage - kafka env_file: - ./env/common_python.env - ./env/storage.env environment: SWH_CONFIG_FILENAME: /storage.yml STORAGE_BACKEND: postgresql entrypoint: /entrypoint.sh volumes: - "./conf/storage.yml:/storage.yml:ro" - "./services/swh-storage/entrypoint.sh:/entrypoint.sh:ro" # Object storage swh-objstorage: build: ./ image: swh/stack ports: - 5003:5003 env_file: - ./env/common_python.env environment: SWH_CONFIG_FILENAME: /objstorage.yml entrypoint: /entrypoint.sh volumes: - "./conf/objstorage.yml:/objstorage.yml:ro" - "./services/swh-objstorage/entrypoint.sh:/entrypoint.sh:ro" # Indexer storage swh-idx-storage-db: image: postgres:12 env_file: - ./env/indexers-db.env swh-idx-storage: image: swh/stack build: ./ ports: - 5007:5007 depends_on: - swh-idx-storage-db env_file: - ./env/common_python.env - ./env/indexers-db.env - ./env/indexers.env environment: SWH_CONFIG_FILENAME: /indexer_storage.yml entrypoint: /entrypoint.sh volumes: - "./conf/indexer_storage.yml:/indexer_storage.yml:ro" - "./services/swh-indexer-storage/entrypoint.sh:/entrypoint.sh:ro" # Web interface swh-web-db: image: postgres:12 env_file: - ./env/01-web-db.env swh-web: build: ./ image: swh/stack ports: - 3000:3000 - 5004:5004 depends_on: - swh-idx-storage - swh-scheduler - swh-storage - swh-web-db env_file: - ./env/common_python.env - ./env/01-web-db.env - ./env/02-web-db.env environment: VERBOSITY: 3 DJANGO_SETTINGS_MODULE: swh.web.settings.production SWH_CONFIG_FILENAME: /web.yml entrypoint: /entrypoint.sh volumes: - "./conf/web.yml:/web.yml:ro" - "./services/swh-web/entrypoint.sh:/entrypoint.sh:ro" swh-deposit-db: image: postgres:12 env_file: - ./env/deposit-db.env swh-deposit: image: swh/stack build: ./ ports: - 5006:5006 depends_on: - swh-deposit-db - swh-scheduler env_file: - ./env/common_python.env - ./env/deposit-db.env - ./env/deposit.env environment: VERBOSITY: 3 SWH_CONFIG_FILENAME: /deposit.yml 
DJANGO_SETTINGS_MODULE: swh.deposit.settings.production entrypoint: /entrypoint.sh volumes: - "./conf/deposit.yml:/deposit.yml:ro" - "./services/swh-deposit/entrypoint.sh:/entrypoint.sh:ro" swh-vault-db: image: postgres:12 env_file: - ./env/vault-db.env swh-vault: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/vault-db.env - ./env/vault.env environment: SWH_CONFIG_FILENAME: /vault.yml command: server ports: - 5005:5005 depends_on: - swh-vault-db - swh-objstorage - swh-storage - swh-scheduler entrypoint: /entrypoint.sh volumes: - "./conf/vault.yml:/vault.yml:ro" - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" swh-vault-worker: image: swh/stack build: ./ command: worker env_file: - ./env/common_python.env - ./env/workers.env environment: SWH_CONFIG_FILENAME: /cooker.yml depends_on: - swh-vault - swh-storage entrypoint: /entrypoint.sh volumes: - "./conf/vault-worker.yml:/cooker.yml:ro" - "./services/swh-vault/entrypoint.sh:/entrypoint.sh:ro" # Lister Celery workers swh-lister: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/listers.env - ./env/workers.env user: swh environment: SWH_WORKER_INSTANCE: listers SWH_CONFIG_FILENAME: /lister.yml depends_on: - swh-scheduler - swh-scheduler-runner - amqp entrypoint: /entrypoint.sh volumes: - "./conf/lister.yml:/lister.yml:ro" - "./services/swh-listers-worker/entrypoint.sh:/entrypoint.sh:ro" # Loader + deposit checker Celery workers swh-loader: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/workers.env user: swh environment: SWH_WORKER_INSTANCE: loader SWH_CONFIG_FILENAME: /loader.yml entrypoint: /entrypoint.sh depends_on: - swh-storage - swh-scheduler - amqp volumes: - "./conf/loader.yml:/loader.yml:ro" - "./services/swh-worker/entrypoint.sh:/entrypoint.sh:ro" swh-loader-deposit: image: swh/stack build: ./ env_file: - ./env/common_python.env - ./env/workers.env user: swh environment: SWH_WORKER_INSTANCE: loader-deposit SWH_CONFIG_FILENAME: 
/loader-deposit.yml entrypoint: /entrypoint.sh depends_on: - swh-storage - swh-scheduler - swh-deposit - amqp volumes: - "./conf/loader-deposit.yml:/loader-deposit.yml:ro" - "./services/swh-worker/entrypoint.sh:/entrypoint.sh:ro" # Indexer Celery workers swh-indexer: image: swh/stack build: ./ user: swh env_file: - ./env/common_python.env - ./env/indexers-db.env - ./env/indexers.env - ./env/workers.env environment: SWH_WORKER_INSTANCE: indexer SWH_CONFIG_FILENAME: /indexer.yml CONCURRENCY: 4 entrypoint: /entrypoint.sh depends_on: - swh-scheduler-runner - swh-idx-storage - swh-storage - swh-objstorage - amqp volumes: - "./conf/indexer.yml:/indexer.yml:ro" - "./services/swh-indexer-worker/entrypoint.sh:/entrypoint.sh:ro" # Journal related swh-indexer-journal-client: image: swh/stack build: ./ entrypoint: /entrypoint.sh env_file: - ./env/common_python.env depends_on: kafka: condition: service_healthy swh-storage: condition: service_started swh-scheduler: condition: service_started volumes: - "./conf/indexer_journal_client.yml:/etc/softwareheritage/indexer/journal_client.yml:ro" - "./services/swh-indexer-journal-client/entrypoint.sh:/entrypoint.sh:ro" swh-scheduler-journal-client: image: swh/stack build: ./ entrypoint: /entrypoint.sh env_file: - ./env/common_python.env depends_on: kafka: condition: service_healthy swh-scheduler: condition: service_started volumes: - "./conf/scheduler_journal_client.yml:/etc/softwareheritage/scheduler/journal_client.yml:ro" - "./services/swh-scheduler-journal-client/entrypoint.sh:/entrypoint.sh:ro"
diff --git a/docker/services/swh-graph/entrypoint.sh b/docker/services/swh-graph/entrypoint.sh
new file mode 100755
index 0000000..b7eaa2c
--- /dev/null
+++ b/docker/services/swh-graph/entrypoint.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Entrypoint for the swh-graph service container.
+#
+# With "shell" as first argument, replaces itself with an interactive bash.
+# With anything else, rebuilds the graph under $DATADIR (export, sort,
+# compress) and then execs the swh-graph API server on port 5009.
+
+set -e
+
+source /srv/softwareheritage/utils/pyutils.sh
+setup_pip
+
+DATADIR=/srv/softwareheritage/graph
+
+case "$1" in
+    "shell")
+        exec bash -i
+        ;;
+    *)
+        mkdir -p "$DATADIR/"
+        # Cleanup results from previous runs. ${DATADIR:?} makes the shell abort
+        # rather than expand to "/*" if DATADIR is ever empty or unset.
+        rm -rf "${DATADIR:?}"/*
+        mkdir "$DATADIR/g/"
+        echo "Exporting edges and nodes"
+        swh dataset -C "$SWH_CONFIG_FILENAME" graph export "$DATADIR/g" --processes=4
+        echo "Sorting edges and nodes"
+        swh dataset graph sort "$DATADIR/g/edges"
+        echo "Compressing graph"
+        swh graph compress --graph "$DATADIR/g/edges/graph" --outdir "$DATADIR/compressed"
+        echo "Starting the swh-graph API server"
+        # exec makes gunicorn replace this shell process instead of running as its child.
+        exec gunicorn --bind 0.0.0.0:5009 \
+             --worker-class aiohttp.worker.GunicornWebWorker \
+             --reload \
+             --threads 4 \
+             --workers 2 \
+             --log-level DEBUG \
+             --timeout 3600 \
+             --config 'python:swh.core.api.gunicorn_config' \
+             'swh.graph.server.app:make_app_from_configfile()'
+        ;;
+esac