diff --git a/tools/git2graph/Makefile b/tools/git2graph/Makefile index 2e19cff..958ea0d 100644 --- a/tools/git2graph/Makefile +++ b/tools/git2graph/Makefile @@ -1,17 +1,25 @@ CC = gcc LD = gcc +BATS = bats + LIBS = libgit2 CFLAGS = -Wall -Werror $(shell pkg-config --cflags $(LIBS)) LDFLAGS = $(shell pkg-config --libs $(LIBS)) +BATS_FLAGS = all: git2graph git2graph: git2graph.o $(LD) -o $@ $^ $(CFLAGS) $(LDFLAGS) %.o: %.c $(CC) $(CFLAGS) -c $< +test: all + $(BATS) $(BATS_FLAGS) tests/ + clean: rm -f *.o git2graph + +.PHONY: all clean test diff --git a/tools/git2graph/README.md b/tools/git2graph/README.md index 7e8202a..f4bd4c9 100644 --- a/tools/git2graph/README.md +++ b/tools/git2graph/README.md @@ -1,43 +1,55 @@ git2graph ========= `git2graph` crawls a Git repository and outputs it as a graph, i.e., as a pair of textual files <nodes, edges>. The nodes file will contain a list of graph nodes as Software Heritage (SWH) Persistent Identifiers (PIDs); the edges file a list of graph edges as PID pairs. +Dependencies +------------ + +Build time dependencies: + +- [libgit2](https://libgit2.org/) + +Test dependencies: + +- [bats](https://github.com/bats-core/bats-core) + + Micro benchmark --------------- $ time ./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz) ./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz 243,30s user 17,28s system 89% cpu 4:51,53 total $ zcat nodes.csv.gz | wc -l 6503402 $ zcat edges.csv.gz | wc -l 305095437 Parallel use ------------ `git2graph` writes fixed-length lines, long either 51 bytes (nodes) or 102 bytes (edges). When writing to a FIFO less than `PIPE_BUF` bytes (which is 4096 bytes on Linux, and guaranteed to be at least 512 bytes by POSIX), writes are atomic. 
Hence it is possible to mass analyze many repositories in parallel with something like: $ mkfifo nodes.fifo edges.fifo $ sort -u < nodes.fifo | pigz -c > nodes.csv.gz & $ sort -u < edges.fifo | pigz -c > edges.csv.gz & $ parallel -i git2graph '{}' nodes.fifo edges.fifo -- repo_dir_1 repo_dir_2 ... $ rm nodes.fifo edges.fifo Note that you most likely want to tune `sort` in order to be parallel (`--parallel`), use a large buffer size (`-S`), and use a temporary directory with enough available space (`-T`). (The above example uses `parallel` from [moreutils](https://joeyh.name/code/moreutils/), but it could trivially be adapted to use [GNU parallel](https://www.gnu.org/software/parallel/) or similar parallelization tools.) diff --git a/tools/git2graph/tests/data/graphs/full/edges.csv b/tools/git2graph/tests/data/graphs/full/edges.csv new file mode 100644 index 0000000..43c81d2 --- /dev/null +++ b/tools/git2graph/tests/data/graphs/full/edges.csv @@ -0,0 +1,32 @@ +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c 
swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca +swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 
swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 +swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 diff --git a/tools/git2graph/tests/data/graphs/full/nodes.csv b/tools/git2graph/tests/data/graphs/full/nodes.csv new file mode 100644 index 0000000..ea100d1 --- /dev/null +++ b/tools/git2graph/tests/data/graphs/full/nodes.csv @@ -0,0 +1,22 @@ +swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 +swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 +swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca +swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 +swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 +swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 +swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 +swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 diff --git a/tools/git2graph/tests/data/sample-repo.tgz b/tools/git2graph/tests/data/sample-repo.tgz new file mode 100644 index 0000000..5b5baa7 Binary files /dev/null and b/tools/git2graph/tests/data/sample-repo.tgz differ diff --git 
a/tools/git2graph/tests/full-graph.bats b/tools/git2graph/tests/full-graph.bats
new file mode 100644
index 0000000..1f9d5c4
--- /dev/null
+++ b/tools/git2graph/tests/full-graph.bats
@@ -0,0 +1,11 @@
+#!/usr/bin/env bats
+
+# End-to-end test: export the sample repository with git2graph and compare
+# the result with the reference graph stored under ${DATA_DIR}/graphs/full.
+
+load repo_helper
+
+@test "export entire graph" {
+    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR"
+    assert_equal_graphs "${DATA_DIR}/graphs/full" "${TEST_TMPDIR}"
+}
diff --git a/tools/git2graph/tests/repo_helper.bash b/tools/git2graph/tests/repo_helper.bash
new file mode 100644
index 0000000..c6ed284
--- /dev/null
+++ b/tools/git2graph/tests/repo_helper.bash
@@ -0,0 +1,49 @@
+
+# Shared bats helpers for the git2graph tests: per-test fixture setup and
+# teardown, a wrapper that runs git2graph, and a graph comparison assertion.
+
+DATA_DIR="${BATS_TEST_DIRNAME}/data"
+TEST_REPO_TGZ="${DATA_DIR}/sample-repo.tgz"
+
+# Create a scratch directory and unpack the sample repository into it.
+setup () {
+    TEST_TMPDIR=$(mktemp -td swh-graph-test.XXXXXXXXXX)
+    # Use && so that tar never extracts into the current directory if cd fails.
+    (cd "$TEST_TMPDIR" && tar xaf "$TEST_REPO_TGZ")
+    TEST_REPO_DIR="${TEST_TMPDIR}/sample-repo"
+}
+
+# Remove the scratch directory created by setup.
+teardown () {
+    rm -rf "$TEST_TMPDIR"
+}
+
+# Invoke git2graph (SUT) on the given repo_dir and store its results in the CSV
+# files nodes.csv and edges.csv located under the given dest_dir. Both files
+# are sorted (bytewise, LC_ALL=C) so they can be compared with reference graphs.
+run_git2graph () {
+    repo_dir="$1"
+    dest_dir="$2"
+    nodes_file="${dest_dir}/nodes.csv"
+    edges_file="${dest_dir}/edges.csv"
+
+    mkdir -p "$dest_dir"
+
+    # Sort only after git2graph has exited. Sorting inside >(...) process
+    # substitutions is racy: bash does not wait for them, so the CSV files
+    # could still be incomplete when this function returns.
+    ./git2graph "$repo_dir" "${nodes_file}.unsorted" "${edges_file}.unsorted" &&
+        LC_ALL=C sort "${nodes_file}.unsorted" > "$nodes_file" &&
+        LC_ALL=C sort "${edges_file}.unsorted" > "$edges_file"
+}
+
+# Ensure that two graphs are equal. Each graph is specified as a dir that
+# should contain a pair of sorted, textual files called nodes.csv and
+# edges.csv. Comparison is done using diff.
+assert_equal_graphs () {
+    dir_1="$1"
+    dir_2="$2"
+
+    diff "${dir_1}/nodes.csv" "${dir_2}/nodes.csv" &&
+        diff "${dir_1}/edges.csv" "${dir_2}/edges.csv"
+}