Page MenuHomeSoftware Heritage

D6032.diff
No OneTemporary

D6032.diff

diff --git a/docker/Dockerfile b/docker/Dockerfile
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,13 +15,18 @@
libpq-dev \
libsvn-dev \
libsystemd-dev \
+ gcc \
memcached \
+ openjdk-11-jre \
+ pkg-config \
+ pv \
postgresql-client-12 \
wait-for-it \
ngrep \
rsync \
nodejs \
- yarn && \
+ yarn \
+ zstd && \
apt-get install -y --no-install-recommends \
opam \
r-base-core \
@@ -29,6 +34,7 @@
apt-get clean && \
rm -rf /var/lib/apt/lists/*
+
RUN useradd -md /srv/softwareheritage -s /bin/bash swh
USER swh
diff --git a/docker/README.rst b/docker/README.rst
--- a/docker/README.rst
+++ b/docker/README.rst
@@ -94,8 +94,14 @@
Keyword args:
url=https://0xacab.org/api/v4
-This task will scrape the forge’s project list and create subtasks to
-inject each git repository found there.
+This task will scrape the forge’s project list and register origins with the scheduler.
+This takes at most a couple of minutes.
+
+Then, you must tell the scheduler to create loading tasks for these origins.
+For example, to create tasks for 100 of these origins::
+
+ ~/swh-environment/docker$ docker-compose exec swh-scheduler \
+ swh scheduler origin schedule-next git 100
This will take a bit of time to complete.
diff --git a/docker/conf/graph.yml b/docker/conf/graph.yml
new file mode 100644
--- /dev/null
+++ b/docker/conf/graph.yml
@@ -0,0 +1,9 @@
+# Journal (Kafka) connection: broker host and topic prefix.
+journal:
+  brokers: [kafka]
+  prefix: swh.journal.objects
+
+# Path of the compressed graph.
+graph:
+  path: /srv/softwareheritage/graph/compressed/graph
diff --git a/docker/docker-compose.graph.yml b/docker/docker-compose.graph.yml
new file mode 100644
--- /dev/null
+++ b/docker/docker-compose.graph.yml
@@ -0,0 +1,25 @@
+# Adds the swh-graph service; kafka, which it depends on, is not defined here.
+version: "2.1"
+
+services:
+  swh-graph:
+    image: swh/stack
+    build: ./
+    entrypoint: /entrypoint.sh
+    ports:
+      # Quoted so YAML always reads the mapping as a string.
+      - "5009:5009"
+    environment:
+      SWH_CONFIG_FILENAME: /graph.yml
+    depends_on:
+      kafka:
+        condition: service_healthy
+    env_file:
+      - ./env/common_python.env
+    volumes:
+      # Read-only config and entrypoint.
+      - "./conf/graph.yml:/graph.yml:ro"
+      - "./services/swh-graph/entrypoint.sh:/entrypoint.sh:ro"
+      # Sibling source checkouts.
+      - "../swh-graph:/src/swh-graph"
+      - "../swh-dataset:/src/swh-dataset"
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -19,12 +19,12 @@
env_file: ./env/kafka.env
environment:
KAFKA_CREATE_TOPICS: swh.journal.objects.content:1:1,
- swh.journal.objects.origin:1:1,
- swh.journal.objects.origin_visit:1:1,
- swh.journal.objects.origin_visit_status:1:1,
- swh.journal.objects.revision:1:1,
- swh.journal.objects.release:1:1,
- swh.journal.indexed.origin_intrinsic_metadata:1:1
+ swh.journal.objects.origin:16:1,
+ swh.journal.objects.origin_visit:16:1,
+ swh.journal.objects.origin_visit_status:16:1,
+ swh.journal.objects.revision:16:1,
+ swh.journal.objects.release:16:1,
+ swh.journal.indexed.origin_intrinsic_metadata:16:1
depends_on:
- zookeeper
healthcheck:
diff --git a/docker/services/swh-graph/entrypoint.sh b/docker/services/swh-graph/entrypoint.sh
new file mode 100755
--- /dev/null
+++ b/docker/services/swh-graph/entrypoint.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Entrypoint of the swh-graph container.
+#   shell     - start an interactive bash instead of the service
+#   otherwise - export, sort and compress the graph, then serve the API
+
+# Stop at the first failing command.
+set -e
+
+source /srv/softwareheritage/utils/pyutils.sh
+setup_pip
+
+DATADIR=/srv/softwareheritage/graph
+
+case "$1" in
+    "shell")
+        exec bash -i
+        ;;
+    *)
+        # Expansions are quoted so the paths can never be word-split or globbed.
+        mkdir -p "$DATADIR/g/"
+        echo "Exporting edges and nodes"
+        PYTHONUNBUFFERED=1 swh dataset -C "$SWH_CONFIG_FILENAME" graph export "$DATADIR/g" --processes=4
+        echo "Sorting edges and nodes"
+        PYTHONUNBUFFERED=1 swh dataset graph sort "$DATADIR/g/edges/"
+        echo "Compressing graph"
+        swh graph compress --graph "$DATADIR/g/edges/graph" --outdir "$DATADIR/compressed"
+        echo "Starting the swh-graph API server"
+        exec gunicorn --bind 0.0.0.0:5009 \
+             --worker-class aiohttp.worker.GunicornWebWorker \
+             --reload \
+             --threads 4 \
+             --workers 2 \
+             --log-level DEBUG \
+             --timeout 3600 \
+             --config 'python:swh.core.api.gunicorn_config' \
+             'swh.graph.server.app:make_app_from_configfile()'
+        ;;
+esac

File Metadata

Mime Type
text/plain
Expires
Wed, Jul 2, 10:51 AM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230002

Event Timeline