diff --git a/tools/git2graph/Makefile b/tools/git2graph/Makefile index 2b804f7..b6671ea 100644 --- a/tools/git2graph/Makefile +++ b/tools/git2graph/Makefile @@ -1,25 +1,25 @@ CC = gcc LD = gcc BATS = bats LIBS = libgit2 glib-2.0 -CFLAGS = -Wall -Werror $(shell pkg-config --cflags $(LIBS)) -g +CFLAGS = -Wall -Werror $(shell pkg-config --cflags $(LIBS)) LDFLAGS = $(shell pkg-config --libs $(LIBS)) BATS_FLAGS = all: git2graph git2graph: git2graph.o $(LD) -o $@ $^ $(CFLAGS) $(LDFLAGS) %.o: %.c $(CC) $(CFLAGS) -c $< test: all $(BATS) $(BATS_FLAGS) tests/ clean: rm -f *.o git2graph .PHONY: all clean test diff --git a/tools/git2graph/README.md b/tools/git2graph/README.md index 840a06d..30fd648 100644 --- a/tools/git2graph/README.md +++ b/tools/git2graph/README.md @@ -1,55 +1,65 @@ git2graph ========= `git2graph` crawls a Git repository and outputs it as a graph, i.e., as a pair of textual files . The nodes file will contain a list of graph nodes as Software Heritage (SWH) Persistent Identifiers (PIDs); the edges file a list of graph edges as PID pairs. +Nodes file +---------- + +`git2graph` outputs a textual edges file. 
If you also need a *nodes* file, with +one PID per line, you can postprocess the edges file as follows: + + $ git2graph REPO_DIR > edges.csv + $ tr ' ' '\n' < edges.csv | sort -u > nodes.csv + + Dependencies ------------ Build time dependencies: - [libgit2](https://libgit2.org/) Test dependencies: - [bats](https://github.com/bats-core/bats-core) Micro benchmark --------------- - $ time ./git2graph -n >(pigz -c > nodes.csv.gz) -e >(pigz -c > edges.csv.gz) /srv/src/linux - ./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz 243,30s user 17,28s system 89% cpu 4:51,53 total - - $ zcat nodes.csv.gz | wc -l - 6503402 + $ time ./git2graph -o >(pigz -c > edges.csv.gz) /srv/src/linux + ./git2graph -o >(pigz -c > edges.csv.gz /srv/src/linux 243,30s user 17,28s system 89% cpu 4:51,53 total $ zcat edges.csv.gz | wc -l 305095437 + + $ zcat edges.csv.gz | tr ' ' '\n' | sort -u | pigz -c > nodes.csv.gz + $ zcat nodes.csv.gz | wc -l + 6503402 Parallel use ------------ `git2graph` writes fixed-length lines, long either 51 bytes (nodes) or 102 bytes (edges). When writing to a FIFO less than `PIPE_BUF` bytes (which is 4096 bytes on Linux, and guaranteed to be at least 512 bytes by POSIX), writes are atomic. Hence it is possible to mass analyze many repositories in parallel with something like: - $ mkfifo nodes.fifo edges.fifo - $ sort -u < nodes.fifo | pigz -c > nodes.csv.gz & + $ mkfifo edges.fifo $ sort -u < edges.fifo | pigz -c > edges.csv.gz & - $ parallel git2graph -n nodes.fifo -e edges.fifo -- repo_dir_1 repo_dir_2 ... - $ rm nodes.fifo edges.fifo + $ parallel git2graph -o edges.fifo -- repo_dir_1 repo_dir_2 ... + $ rm edges.fifo Note that you most likely want to tune `sort` in order to be parallel (`--parallel`), use a large buffer size (`-S`), and use a temporary directory with enough available space (`-T`). 
(The above example uses `parallel` from [moreutils](https://joeyh.name/code/moreutils/), but it could trivially be adapted to use [GNU parallel](https://www.gnu.org/software/parallel/) or similar parallelization tools.) diff --git a/tools/git2graph/git2graph.c b/tools/git2graph/git2graph.c index 45dcd56..e172606 100644 --- a/tools/git2graph/git2graph.c +++ b/tools/git2graph/git2graph.c @@ -1,514 +1,478 @@ /* * Copyright (C) 2019 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information */ -/* Crawl a Git repository and output it as a graph, i.e., as a pair of textual - * files . The nodes file will contain a list of graph nodes as - * Software Heritage (SWH) Persistent Identifiers (PIDs); the edges file a list - * of graph edges as PID pairs. +/* Crawl a Git repository and output it as a graph, i.e., as a textual file + * containing a list of graph edges, one per line. Each edge is a + * pair of Software Heritage (SWH) Persistent Identifiers (PIDs). */ #include #include #include #include #include #include #include #include #include #include #define SWH_PREFIX "swh:1" #define SWH_DIR "swh:1:dir" #define SWH_REV "swh:1:rev" #define SWH_PIDSZ (GIT_OID_HEXSZ + 10) // size of a SWH PID -// line-lengths in nodes and edges file -#define NODES_LINELEN (SWH_PIDSZ + 1) +// length of a textual edge line #define EDGES_LINELEN (SWH_PIDSZ * 2 + 2) -// Output buffer sizes for nodes and edges files. To guarantee atomic and -// non-interleaved writes (which matter when used concurrently writing to a -// shared FIFO), these sizes must be <= PIPE_BUF and multiples of -// {NODES,EDGES}_LINELEN. -#define NODES_OUTSZ ((PIPE_BUF / NODES_LINELEN) * NODES_LINELEN) +// Output buffer size for the edges file. 
To guarantee atomic and non-interleaved +// writes (which matter when used concurrently writing to a shared FIFO), size +// must be <= PIPE_BUF and a multiple of EDGES_LINELEN. #define EDGES_OUTSZ ((PIPE_BUF / EDGES_LINELEN) * EDGES_LINELEN) // GIT_OBJ_* constants extension for non-git objects #define SWH_OBJ_SNP 5 // snapshots (swh:1:snp:...) #define SWH_OBJ_ORI 6 // origins (swh:1:ori:...) #define SWH_OBJ_LOC 7 // lines of code (swh:1:loc:...) #define OBJ_TYPES 8 #define ELT_SEP "," // element separator in lists #define PAIR_SEP ":" // key/value separator in paris /* map from libgit2's git_otype (+ SWH-specific types above) to SWH PID type * qualifiers */ static char *_git_otype2swh[OBJ_TYPES] = { "*", // 0 == GIT_OBJ__EXT1 (unused in libgit2, used as wildcard here) "rev", // 1 == GIT_OBJ_COMMIT "dir", // 2 == GIT_OBJ_TREE "cnt", // 3 == GIT_OBJ_BLOB "rel", // 4 == GIT_OBJ_TAG "snp", // 5 == SWH_OBJ_SNP "ori", // 6 == SWH_OBJ_ORI "loc", // 7 == SWH_OBJ_LOC }; #define GIT_OBJ_ANY GIT_OBJ__EXT1 /* Convert a git object type (+ SWH-specific types above) to the corresponding * SWH PID type. */ #define git_otype2swh(type) _git_otype2swh[(type)] /* Parse object type (libgit's + SWH-specific types) from 3-letter type * qualifiers. Return either object type, or 0 in case of "*" wildcard, or -1 * in case of parse error. */ int parse_otype(char *str) { for (int i = 0; i < OBJ_TYPES; i++) { if (strcmp(str, _git_otype2swh[i]) == 0) return i; } return -1; } /* Allowed edge types matrix. Each cell denotes whether edges from a given * SRC_TYPE to a given DST_TYPE should be produced or not. 
*/ static int _allowed_edges[OBJ_TYPES][OBJ_TYPES] = { // TO rev dir cnt rel snp ori loc | // ---------------------------------------------------------------- {true, true, true, true, true, true, true, true}, // | FROM {true, true, true, true, true, true, true, true}, // | rev {true, true, true, true, true, true, true, true}, // | dir {true, true, true, true, true, true, true, true}, // | cnt {true, true, true, true, true, true, true, true}, // | rel {true, true, true, true, true, true, true, true}, // | snp {true, true, true, true, true, true, true, true}, // | ori {true, true, true, true, true, true, true, true}, // | loc }; -/* Allowed node type vector. */ +/* Whether a node type is allowed as *origin* for edges. Derived information + * from the _allowed_edges matrix. */ static int _allowed_nodes[OBJ_TYPES] = { - true, // - true, // rev - true, // dir - true, // cnt - true, // rel - true, // snp - true, // ori - true, // loc + true, // + true, // rev + true, // dir + true, // cnt + true, // rel + true, // snp + true, // ori + true, // loc }; #define is_edge_allowed(src_type, dst_type) _allowed_edges[(src_type)][(dst_type)] #define is_node_allowed(type) _allowed_nodes[(type)] + /* extra payload for callback invoked on Git objects */ typedef struct { git_odb *odb; // Git object DB git_repository *repo; // Git repository - FILE *nodes_out; // stream to write nodes to, or NULL FILE *edges_out; // stream to write edges to, or NULL } cb_payload; /* Invoke a libgit2 method and exits with an error message in case of * failure. * * Reused from libgit2 examples, specifically common.c, available under CC0. 
*/ void check_lg2(int error, const char *message, const char *extra) { const git_error *lg2err; const char *lg2msg = "", *lg2spacer = ""; if (!error) return; if ((lg2err = giterr_last()) != NULL && lg2err->message != NULL) { lg2msg = lg2err->message; lg2spacer = " - "; } if (extra) fprintf(stderr, "%s '%s' [%d]%s%s\n", message, extra, error, lg2spacer, lg2msg); else fprintf(stderr, "%s [%d]%s%s\n", message, error, lg2spacer, lg2msg); exit(1); } -/* Compute allowed node types base on allowed edge types, which is a sane - * default. The result should be overridden in case one wants to output - * specific nodes, but not their outgoing edges. */ +/* Compute allowed node types based on allowed edge types. */ void init_allowed_nodes_from_edges( int allowed_edges[OBJ_TYPES][OBJ_TYPES], int allowed_nodes[OBJ_TYPES]) { for (int i = 0; i < OBJ_TYPES; i++) { allowed_nodes[i] = false; // disallowed by default - // allowed if either a edge can originate from it... + // allowed if an edge can originate from it... for (int src_type = 0; src_type < OBJ_TYPES; src_type++) - allowed_nodes[i] = allowed_nodes[i] \ + allowed_nodes[i] = allowed_nodes[i] \ || allowed_edges[src_type][i]; // ...or lead to it for (int dst_type = 0; dst_type < OBJ_TYPES; dst_type++) - allowed_nodes[i] = allowed_nodes[i] \ + allowed_nodes[i] = allowed_nodes[i] \ || allowed_edges[i][dst_type]; } } /* Emit commit edges. 
*/ void emit_commit_edges(const git_commit *commit, const char *swhpid, FILE *out) { unsigned int i, max_i; char oidstr[GIT_OID_HEXSZ + 1]; // to PID // rev -> dir if (is_edge_allowed(GIT_OBJ_COMMIT, GIT_OBJ_TREE)) { git_oid_tostr(oidstr, sizeof(oidstr), git_commit_tree_id(commit)); fprintf(out, "%s %s:%s\n", swhpid, SWH_DIR, oidstr); } // rev -> rev if (is_edge_allowed(GIT_OBJ_COMMIT, GIT_OBJ_COMMIT)) { max_i = (unsigned int)git_commit_parentcount(commit); for (i = 0; i < max_i; ++i) { git_oid_tostr(oidstr, sizeof(oidstr), git_commit_parent_id(commit, i)); fprintf(out, "%s %s:%s\n", swhpid, SWH_REV, oidstr); } } } /* Emit tag edges. */ void emit_tag_edges(const git_tag *tag, const char *swhpid, FILE *out) { char oidstr[GIT_OID_HEXSZ + 1]; int target_type; // rel -> * target_type = git_tag_target_type(tag); if (is_edge_allowed(GIT_OBJ_TAG, target_type)) { git_oid_tostr(oidstr, sizeof(oidstr), git_tag_target_id(tag)); fprintf(out, "%s %s:%s:%s\n", swhpid, SWH_PREFIX, git_otype2swh(target_type), oidstr); } } /* Emit tree edges. */ void emit_tree_edges(const git_tree *tree, const char *swhpid, FILE *out) { size_t i, max_i = (int)git_tree_entrycount(tree); char oidstr[GIT_OID_HEXSZ + 1]; const git_tree_entry *te; int entry_type; // dir -> * for (i = 0; i < max_i; ++i) { te = git_tree_entry_byindex(tree, i); entry_type = git_tree_entry_type(te); if (is_edge_allowed(GIT_OBJ_TREE, entry_type)) { git_oid_tostr(oidstr, sizeof(oidstr), git_tree_entry_id(te)); fprintf(out, "%s %s:%s:%s\n", swhpid, SWH_PREFIX, git_otype2swh(entry_type), oidstr); } } } -/* Emit node and edges for current object. */ +/* Emit edges for current object. 
*/ int emit_obj(const git_oid *id, void *payload) { char oidstr[GIT_OID_HEXSZ + 1]; char swhpid[SWH_PIDSZ + 1]; size_t len; int obj_type; git_commit *commit; git_tag *tag; git_tree *tree; git_odb *odb = ((cb_payload *) payload)->odb; git_repository *repo = ((cb_payload *) payload)->repo; - FILE *nodes_out = ((cb_payload *) payload)->nodes_out; FILE *edges_out = ((cb_payload *) payload)->edges_out; check_lg2(git_odb_read_header(&len, &obj_type, odb, id), "cannot read object header", NULL); - if (!is_node_allowed(obj_type)) // no outbound edges allowed, skip node + if (!is_node_allowed(obj_type)) return 0; - // emit node + // format node PID sprintf(swhpid, "swh:1:%s:", git_otype2swh(obj_type)); git_oid_tostr(swhpid + 10, sizeof(oidstr), id); - if (nodes_out != NULL) - fprintf(nodes_out, "%s\n", swhpid); // emit edges - if (edges_out != NULL) { - switch (obj_type) { - case GIT_OBJ_BLOB: // graph leaf: no edges to emit - break; - case GIT_OBJ_COMMIT: - check_lg2(git_commit_lookup(&commit, repo, id), - "cannot find commit", NULL); - emit_commit_edges(commit, swhpid, edges_out); - git_commit_free(commit); - break; - case GIT_OBJ_TAG: - check_lg2(git_tag_lookup(&tag, repo, id), - "cannot find tag", NULL); - emit_tag_edges(tag, swhpid, edges_out); - git_tag_free(tag); - break; - case GIT_OBJ_TREE: - check_lg2(git_tree_lookup(&tree, repo, id), - "cannot find tree", NULL); - emit_tree_edges(tree, swhpid, edges_out); - git_tree_free(tree); - break; - default: - git_oid_tostr(oidstr, sizeof(oidstr), id); - fprintf(stderr, "ignoring unknown object: %s\n", oidstr); - break; - } + switch (obj_type) { + case GIT_OBJ_BLOB: // graph leaf: no edges to emit + break; + case GIT_OBJ_COMMIT: + check_lg2(git_commit_lookup(&commit, repo, id), + "cannot find commit", NULL); + emit_commit_edges(commit, swhpid, edges_out); + git_commit_free(commit); + break; + case GIT_OBJ_TAG: + check_lg2(git_tag_lookup(&tag, repo, id), + "cannot find tag", NULL); + emit_tag_edges(tag, swhpid, edges_out); + 
git_tag_free(tag); + break; + case GIT_OBJ_TREE: + check_lg2(git_tree_lookup(&tree, repo, id), + "cannot find tree", NULL); + emit_tree_edges(tree, swhpid, edges_out); + git_tree_free(tree); + break; + default: + git_oid_tostr(oidstr, sizeof(oidstr), id); + fprintf(stderr, "ignoring unknown object: %s\n", oidstr); + break; } return 0; } void exit_usage(char *msg) { if (msg != NULL) fprintf(stderr, "Error: %s\n\n", msg); fprintf(stderr, "Usage: git2graph [OPTION..] GIT_REPO_DIR\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -e, --edges-file=PATH file where to store edges\n"); - fprintf(stderr, " -n, --nodes-file=PATH file where to store nodes\n"); + fprintf(stderr, " -o, --output=PATH output file, default to stdout\n"); fprintf(stderr, " -E, --edges-filter=EDGES_EXPR only emit selected edges\n"); - fprintf(stderr, " -N, --nodes-filter=NODES_EXPR only emit selected nodes\n"); fprintf(stderr, "\n"); fprintf(stderr, "EDGES_EXPR is a comma separate list of src_TYPE:dst_TYPE pairs\n"); - fprintf(stderr, "NODES_EXPR is a comme separate list of node TYPEs\n"); - fprintf(stderr, "TYPE is one of: cnt, dir, loc, ori, rel, rev, snp\n"); + fprintf(stderr, "TYPE is one of: cnt, dir, loc, ori, rel, rev, snp, *\n"); fprintf(stderr, "\nNote: you can use \"-\" for stdout in file names.\n"); + exit(EXIT_FAILURE); } /* command line arguments */ typedef struct { - char *nodes_path; - char *edges_path; - char *nodes_filter; + char *outfile; char *edges_filter; char *repo_dir; } cli_args; cli_args *parse_cli(int argc, char **argv) { int opt; cli_args *args = malloc(sizeof(cli_args)); if (args == NULL) { perror("Cannot allocate memory."); exit(EXIT_FAILURE); } else { - args->nodes_path = NULL; - args->edges_path = NULL; - args->nodes_filter = NULL; + args->outfile = NULL; args->edges_filter = NULL; args->repo_dir = NULL; } static struct option long_opts[] = { - {"edges-file", required_argument, 0, 'e' }, - {"nodes-file", required_argument, 0, 'n' }, 
{"edges-filter", required_argument, 0, 'E' }, - {"nodes-filter", required_argument, 0, 'N' }, + {"output", required_argument, 0, 'o' }, {"help", no_argument, 0, 'h' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "e:n:E:N:h", long_opts, + while ((opt = getopt_long(argc, argv, "E:o:h", long_opts, NULL)) != -1) { switch (opt) { - case 'e': args->edges_path = optarg; break; - case 'n': args->nodes_path = optarg; break; case 'E': args->edges_filter = optarg; break; - case 'N': args->nodes_filter = optarg; break; + case 'o': args->outfile = optarg; break; case 'h': default: exit_usage(NULL); } } if (argv[optind] == NULL) exit_usage(NULL); args->repo_dir = argv[optind]; + if (args->outfile == NULL) + args->outfile = "-"; + return args; } /* open output stream specified on the command line (if at all) */ FILE *open_out_stream(char *cli_path, char *buf, int bufsiz) { FILE *stream; if (cli_path == NULL) stream = NULL; else if (strcmp(cli_path, "-") == 0) stream = stdout; else if((stream = fopen(cli_path, "w")) == NULL) { fprintf(stderr, "can't open file: %s\n", cli_path); exit(EXIT_FAILURE); } // ensure atomic and non-interleaved writes if (stream != NULL) setvbuf(stream, buf, _IOFBF, bufsiz); return stream; } void fill_matrix(int matrix[OBJ_TYPES][OBJ_TYPES], int val) { for (int i = 0; i < OBJ_TYPES; i++) for (int j = 0; j < OBJ_TYPES; j++) matrix[i][j] = val; } void fill_row(int matrix[OBJ_TYPES][OBJ_TYPES], int row, int val) { for (int j = 0; j < OBJ_TYPES; j++) matrix[row][j] = val; } void fill_column(int matrix[OBJ_TYPES][OBJ_TYPES], int col, int val) { for (int i = 0; i < OBJ_TYPES; i++) matrix[i][col] = val; } void fill_vector(int vector[OBJ_TYPES], int val) { for (int i = 0; i < OBJ_TYPES; i++) vector[i] = val; } /* Dump node/edge filters to a given stream. For debugging purposes. 
*/ void _dump_filters(FILE *out, int matrix[OBJ_TYPES][OBJ_TYPES], int vector[OBJ_TYPES]) { fprintf(out, "TO rev dir cnt rel snp ori loc FROM\n"); for(int i = 0; i < OBJ_TYPES; i++) { for(int j = 0; j < OBJ_TYPES; j++) fprintf(out, "%d ", matrix[i][j]); fprintf(out, "%s\n", _git_otype2swh[i]); } fprintf(out, " rev dir cnt rel snp ori loc\n"); for (int i = 0; i < OBJ_TYPES; i++) fprintf(out, "%d ", vector[i]); } /* set up nodes and edges restrictions, interpreting command line filters */ -void init_graph_filters(char *nodes_filter, char *edges_filter) { +void init_graph_filters(char *edges_filter) { char **filters; char **types; char **ptr; int src_type, dst_type; if (edges_filter != NULL) { fill_matrix(_allowed_edges, false); // nothing allowed by default filters = g_strsplit(edges_filter, ELT_SEP, -1); // "typ:typ" pairs for (ptr = filters; *ptr; ptr++) { types = g_strsplit(*ptr, PAIR_SEP, 2); // 2 "typ" fragments src_type = parse_otype(types[0]); dst_type = parse_otype(types[1]); if (src_type == GIT_OBJ_ANY && dst_type == GIT_OBJ_ANY) { // "*:*" wildcard fill_matrix(_allowed_edges, true); break; // all edges allowed already } else if (src_type == GIT_OBJ_ANY) { // "*:typ" wildcard fill_column(_allowed_edges, dst_type, true); } else if (dst_type == GIT_OBJ_ANY) { // "typ:*" wildcard fill_row(_allowed_edges, src_type, true); } else // "src_type:dst_type" _allowed_edges[src_type][dst_type] = true; g_strfreev(types); } g_strfreev(filters); } - if (nodes_filter != NULL) { - fill_vector(_allowed_nodes, false); // nothing allowed by default - filters = g_strsplit(nodes_filter, ELT_SEP, -1); // "typ" fragments - for (ptr = filters; *ptr; ptr++) { - src_type = parse_otype(*ptr); - if (src_type == GIT_OBJ_ANY) { // "*" wildcard - fill_vector(_allowed_nodes, true); - break; // all nodes allowed already - } else - _allowed_nodes[src_type] = true; - } - g_strfreev(filters); - } else { // no explicit node filtering request, derive allowed nodes - // from allowed edges - 
init_allowed_nodes_from_edges(_allowed_edges, _allowed_nodes); - } + init_allowed_nodes_from_edges(_allowed_edges, _allowed_nodes); } int main(int argc, char **argv) { git_repository *repo; git_odb *odb; int rc; cli_args *args; cb_payload *payload; - FILE *nodes_out, *edges_out; - char nodes_buf[NODES_OUTSZ]; + FILE *edges_out; char edges_buf[EDGES_OUTSZ]; args = parse_cli(argc, argv); - init_graph_filters(args->nodes_filter, args->edges_filter); + init_graph_filters(args->edges_filter); // _dump_filters(stdout, _allowed_edges, _allowed_nodes); git_libgit2_init(); check_lg2(git_repository_open(&repo, args->repo_dir), "cannot open repository", NULL); check_lg2(git_repository_odb(&odb, repo), "cannot get object DB", NULL); - nodes_out = open_out_stream(args->nodes_path, nodes_buf, NODES_OUTSZ); - edges_out = open_out_stream(args->edges_path, edges_buf, EDGES_OUTSZ); - assert(NODES_OUTSZ <= PIPE_BUF && (NODES_OUTSZ % NODES_LINELEN == 0)); + edges_out = open_out_stream(args->outfile, edges_buf, EDGES_OUTSZ); assert(EDGES_OUTSZ <= PIPE_BUF && (EDGES_OUTSZ % EDGES_LINELEN == 0)); payload = malloc(sizeof(cb_payload)); payload->odb = odb; payload->repo = repo; - payload->nodes_out = nodes_out; payload->edges_out = edges_out; rc = git_odb_foreach(odb, emit_obj, payload); check_lg2(rc, "failure during object iteration", NULL); git_odb_free(odb); git_repository_free(repo); free(payload); exit(rc); } diff --git a/tools/git2graph/tests/data/graphs/directories/nodes.csv b/tools/git2graph/tests/data/graphs/directories/nodes.csv deleted file mode 100644 index 505096b..0000000 --- a/tools/git2graph/tests/data/graphs/directories/nodes.csv +++ /dev/null @@ -1,13 +0,0 @@ -swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 -swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 -swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 -swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 -swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca -swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 
-swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 -swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 -swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 -swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 -swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c -swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c -swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 diff --git a/tools/git2graph/tests/data/graphs/full/nodes.csv b/tools/git2graph/tests/data/graphs/full/nodes.csv deleted file mode 100644 index ea100d1..0000000 --- a/tools/git2graph/tests/data/graphs/full/nodes.csv +++ /dev/null @@ -1,22 +0,0 @@ -swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 -swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 -swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 -swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 -swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca -swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 -swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 -swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 -swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 -swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 -swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c -swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c -swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 -swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 -swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 -swh:1:rev:20cca959bae94594f60450f339b408581f1b401f -swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 -swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 -swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 -swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 -swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 -swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 diff --git a/tools/git2graph/tests/data/graphs/releases/nodes.csv b/tools/git2graph/tests/data/graphs/releases/nodes.csv deleted file mode 100644 index 5e13ab1..0000000 --- a/tools/git2graph/tests/data/graphs/releases/nodes.csv +++ /dev/null @@ 
-1,4 +0,0 @@ -swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 -swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 -swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 -swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 diff --git a/tools/git2graph/tests/data/graphs/revisions/nodes.csv b/tools/git2graph/tests/data/graphs/revisions/nodes.csv deleted file mode 100644 index e324159..0000000 --- a/tools/git2graph/tests/data/graphs/revisions/nodes.csv +++ /dev/null @@ -1,7 +0,0 @@ -swh:1:rev:20cca959bae94594f60450f339b408581f1b401f -swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 -swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 -swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 -swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 -swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 -swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 diff --git a/tools/git2graph/tests/data/graphs/to-revisions/edges.csv b/tools/git2graph/tests/data/graphs/to-revisions/edges.csv new file mode 100644 index 0000000..37d9047 --- /dev/null +++ b/tools/git2graph/tests/data/graphs/to-revisions/edges.csv @@ -0,0 +1,9 @@ +swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 
swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 diff --git a/tools/git2graph/tests/filters.bats b/tools/git2graph/tests/filters.bats index 6a010ce..8d99209 100644 --- a/tools/git2graph/tests/filters.bats +++ b/tools/git2graph/tests/filters.bats @@ -1,18 +1,23 @@ #!/usr/bin/env bats load repo_helper @test "export revisions" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E rev:rev -N rev + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E rev:rev assert_equal_graphs ${DATA_DIR}/graphs/revisions ${TEST_TMPDIR} } +@test "export edges with revision targets" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:rev" + assert_equal_graphs ${DATA_DIR}/graphs/to-revisions ${TEST_TMPDIR} +} + @test "export directories" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E dir:* -N cnt,dir + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "dir:*" assert_equal_graphs ${DATA_DIR}/graphs/directories ${TEST_TMPDIR} } @test "export releases" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E rel:* -N rel,rev + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "rel:*" assert_equal_graphs ${DATA_DIR}/graphs/releases ${TEST_TMPDIR} } diff --git a/tools/git2graph/tests/full-graph.bats b/tools/git2graph/tests/full-graph.bats index 1f9d5c4..8c93a94 100644 --- a/tools/git2graph/tests/full-graph.bats +++ b/tools/git2graph/tests/full-graph.bats @@ -1,8 +1,13 @@ #!/usr/bin/env bats load repo_helper @test "export entire graph" { run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" assert_equal_graphs ${DATA_DIR}/graphs/full ${TEST_TMPDIR} } + +@test "export entire graph (using wildcard)" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:*" + assert_equal_graphs ${DATA_DIR}/graphs/full ${TEST_TMPDIR} +} diff --git a/tools/git2graph/tests/repo_helper.bash b/tools/git2graph/tests/repo_helper.bash index 0d2c5dc..36160b6 100644 --- a/tools/git2graph/tests/repo_helper.bash +++ b/tools/git2graph/tests/repo_helper.bash @@ -1,41 +1,40 @@ DATA_DIR="${BATS_TEST_DIRNAME}/data" 
TEST_REPO_TGZ="${DATA_DIR}/sample-repo.tgz" setup () { TEST_TMPDIR=$(mktemp -td swh-graph-test.XXXXXXXXXX) (cd "$TEST_TMPDIR" ; tar xaf "$TEST_REPO_TGZ") TEST_REPO_DIR="${TEST_TMPDIR}/sample-repo" } teardown () { rm -rf "$TEST_TMPDIR" } # Invoke git2graph (SUT) on the given repo_dir and store its results in the CSV # files nodes.csv and edges.csv located under the given dest_dir. run_git2graph () { repo_dir="$1" dest_dir="$2" shift 2 nodes_file="${dest_dir}/nodes.csv" edges_file="${dest_dir}/edges.csv" if [ ! -d "$dest_dir" ] ; then mkdir -p "$dest_dir" fi - ./git2graph "$@" -n >(sort > "$nodes_file") -e >(sort > "$edges_file") "$repo_dir" + ./git2graph "$@" "$repo_dir" | sort > "$edges_file" } # Ensure that two graphs, each specified as a dir that should contain a pair of # sorted, textual files called nodes.csv and edges.csv. Comparison is done # using diff. assert_equal_graphs () { dir_1="$1" dir_2="$2" - diff "${dir_1}/nodes.csv" "${dir_2}/nodes.csv" && diff "${dir_1}/edges.csv" "${dir_2}/edges.csv" }