diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 319c7ce..93c653c 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -1,37 +1,36 @@ -FROM openjdk:12 +FROM openjdk:13-slim-buster # Java global config ARG MAX_RAM=2800G -ENV JAVA_TOOL_OPTIONS \ - -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \ - -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \ - -XX:+UseTLAB -XX:+ResizeTLAB \ +ENV JAVA_TOOL_OPTIONS \ + -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \ + -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \ + -XX:+UseTLAB -XX:+ResizeTLAB \ -Dlogback.configurationFile=app/configuration/logback.xml -# Monitoring -RUN yum install -y time +# Install swh.graph (both Python and Java parts) +RUN apt-get update +RUN apt-get install --no-install-recommends --yes \ + curl time \ + gcc pkg-config libsystemd-dev python3-dev \ + python3-pip python3-setuptools +RUN pip3 install swh.graph -ARG WEBGRAPH_VERSION=3.5.1 -ARG LAW_VERSION=2.6.0 - -# Download third party binaries and dependencies +# Install 3rd party dependencies (not shipped with swh.graph) WORKDIR /srv/softwareheritage/graph/lib -RUN curl -O http://webgraph.di.unimi.it/webgraph-big-${WEBGRAPH_VERSION}-bin.tar.gz -RUN tar xvfz webgraph-big-${WEBGRAPH_VERSION}-bin.tar.gz -RUN cp webgraph-big-${WEBGRAPH_VERSION}/webgraph-big-${WEBGRAPH_VERSION}.jar . - -RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz -RUN tar xvfz webgraph-big-deps.tar.gz +RUN cp /usr/local/share/swh-graph/*.jar . +# law is not shipped via Maven, download from upstream +ARG LAW_VERSION=2.6.0 RUN curl -O http://law.di.unimi.it/software/download/law-${LAW_VERSION}-bin.tar.gz RUN tar xvfz law-${LAW_VERSION}-bin.tar.gz RUN cp law-${LAW_VERSION}/law-${LAW_VERSION}.jar . # Add user files WORKDIR /srv/softwareheritage/graph/app COPY configuration configuration/ COPY scripts scripts/ # Default dir WORKDIR /srv/softwareheritage/graph diff --git a/docs/docker.rst b/docs/docker.rst index 6e33249..c6739b2 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -1,79 +1,79 @@ Docker environment ================== Build ----- .. code:: bash $ git clone https://forge.softwareheritage.org/source/swh-graph.git $ cd swh-graph $ docker build --tag swh-graph dockerfiles Run --- Given a graph ``g`` specified by: - ``g.edges.csv.gz``: gzip-compressed csv file with one edge per line, as a "SRC_ID SPACE DST_ID" string, where identifiers are the :ref:`persistent-identifiers` of each node. - ``g.nodes.csv.gz``: sorted list of unique node identifiers appearing in the corresponding ``g.edges.csv.gz`` file. The format is a gzip-compressed csv file with one persistent identifier per line. .. code:: bash $ docker run -ti \ --volume /PATH/TO/GRAPH/:/srv/softwareheritage/graph/data \ --publish 127.0.0.1:5009:5009 \ swh-graph:latest \ bash Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.gz`` and ``g.nodes.csv.gz`` files. By default, when entering the container the current working directory will be ``/srv/softwareheritage/graph``; all relative paths found below are intended to be relative to that dir. Graph compression ~~~~~~~~~~~~~~~~~ To compress the graph: .. code:: bash $ app/scripts/compress_graph.sh --lib lib/ --input data/g Warning: very large graphs may need a bigger batch size parameter for WebGraph internals (you can specify a value when running the compression script using: ``--batch-size 1000000000``). Node identifier mappings ~~~~~~~~~~~~~~~~~~~~~~~~ To dump the mapping files (i.e., various node id <-> other info mapping files, in either ``.csv.gz`` or ad-hoc ``.map`` format): .. code:: bash - $ java -cp app/swh-graph.jar \ + $ java -cp lib/swh-graph-jar-with-dependencies.jar \ org.softwareheritage.graph.backend.Setup \ data/g.nodes.csv.gz data/compressed/g Graph server ~~~~~~~~~~~~ To start the swh-graph server: .. code:: bash - $ java -cp app/swh-graph.jar \ + $ java -cp lib/swh-graph-jar-with-dependencies.jar \ org.softwareheritage.graph.App data/compressed/g To specify the port on which the server will run, use the `--port` or `-p` flag (default is 5009). diff --git a/java/server/README.md b/java/server/README.md index baeb2a7..9e2e315 100644 --- a/java/server/README.md +++ b/java/server/README.md @@ -1,50 +1,50 @@ Graph service - Server side =========================== Server side Java REST API. Build ----- ```bash $ mvn compile assembly:single ``` Start REST API -------------- ```bash -$ java -cp target/swh-graph-0.0.2-jar-with-dependencies.jar \ - org.softwareheritage.graph.App \ +$ java -cp target/swh-graph-jar-with-dependencies.jar \ + org.softwareheritage.graph.App \ ``` Default port is 5009 (use the `--port` option to change port number). If you need timings metadata send back to the client in addition to the result, use the `--timings` flag. Tests ----- Unit tests rely on test data that are already available in the Git repository (under `src/test/dataset/`). You generally only need to run them using Maven: ```bash $ mvn test ``` In case you want to regenerate the test data: ```bash # Graph compression $ cd src/test/dataset $ ./generate_graph.sh $ cd ../../../ $ mvn compile assembly:single # Dump mapping files -$ java -cp target/swh-graph-0.0.2-jar-with-dependencies.jar \ - org.softwareheritage.graph.backend.Setup \ - src/test/dataset/example.nodes.csv.gz \ +$ java -cp target/swh-graph-jar-with-dependencies.jar \ + org.softwareheritage.graph.backend.Setup \ + src/test/dataset/example.nodes.csv.gz \ src/test/dataset/output/example ``` diff --git a/java/server/pom.xml b/java/server/pom.xml index f5fd58e..4edde5d 100644 --- a/java/server/pom.xml +++ b/java/server/pom.xml @@ -1,156 +1,182 @@ 4.0.0 org.softwareheritage.graph swh-graph - 0.0.2 + ${git.closest.tag.name} swh-graph https://www.softwareheritage.org/ UTF-8 11 ch.qos.logback logback-classic 1.2.3 junit junit 4.11 test org.hamcrest hamcrest 2.1 test io.javalin javalin 3.0.0 org.slf4j slf4j-simple 1.7.26 com.fasterxml.jackson.core jackson-databind 2.9.8 it.unimi.dsi webgraph-big 3.5.1 it.unimi.dsi fastutil 8.2.2 com.martiansoftware jsap 2.1 net.sf.py4j py4j 0.10.8.1 + swh-graph maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 -verbose -Xlint:all maven-surefire-plugin 2.22.1 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 maven-assembly-plugin org.softwareheritage.graph.App jar-with-dependencies make-assembly package single + + + pl.project13.maven + git-commit-id-plugin + 3.0.1 + + + get-the-git-infos + + revision + + initialize + + + + true + true + true + + true + v* + + + + org.apache.maven.plugins maven-javadoc-plugin 3.1.1 diff --git a/swh/graph/tests/__init__.py b/swh/graph/tests/__init__.py index 44e0c37..e69de29 100644 --- a/swh/graph/tests/__init__.py +++ b/swh/graph/tests/__init__.py @@ -1 +0,0 @@ -SWH_GRAPH_VERSION = '0.0.2'