diff --git a/tools/git2graph/README.md b/tools/git2graph/README.md index 2172a10..c27f95a 100644 --- a/tools/git2graph/README.md +++ b/tools/git2graph/README.md @@ -1,65 +1,54 @@ git2graph ========= `git2graph` crawls a Git repository and outputs it as a graph, i.e., as a pair of textual files . The nodes file will contain a list of graph nodes as Software Heritage (SWH) Persistent Identifiers (PIDs); the edges file a list of graph edges as PID pairs. Dependencies ------------ Build time dependencies: - [libgit2](https://libgit2.org/) Test dependencies: - [bats](https://github.com/bats-core/bats-core) -Nodes file ----------- - -`git2graph` outputs a textual edges file. If you also need a *nodes* file, with -one PID per line, you can postprocess the edges files as follows: - - $ git2graph REPO_DIR > edges.csv - $ sort -u < edges.csv > nodes.csv - - Micro benchmark --------------- - $ time ./git2graph -o >(pigz -c > edges.csv.gz) /srv/src/linux - ./git2graph -o >(pigz -c > edges.csv.gz) /srv/src/linux 232,06s user 16,24s system 90% cpu 4:35,52 total + $ time ./git2graph -n >(pigz -c > nodes.csv.gz) -e >(pigz -c > edges.csv.gz) /srv/src/linux + 232,06s user 16,24s system 90% cpu 4:40,35 total - $ zcat edges.csv.gz | wc -l - 305095437 - - $ zcat edges.csv.gz | tr ' ' '\n' | sort -u | pigz -c > nodes.csv.gz $ zcat nodes.csv.gz | wc -l 6503402 + $ zcat edges.csv.gz | wc -l + 305095437 Parallel use ------------ `git2graph` writes fixed-length lines, long either 51 bytes (nodes) or 102 bytes (edges). When writing to a FIFO less than `PIPE_BUF` bytes (which is 4096 bytes on Linux, and guaranteed to be at least 512 bytes by POSIX), writes are atomic. Hence it is possible to mass analyze many repositories in parallel with something like: - $ mkfifo edges.fifo + $ mkfifo nodes.fifo edges.fifo + $ sort -u < nodes.fifo | pigz -c > nodes.csv.gz & $ sort -u < edges.fifo | pigz -c > edges.csv.gz & - $ parallel git2graph -o edges.fifo -- repo_dir_1 repo_dir_2 ... - $ rm edges.fifo + $ parallel git2graph -n nodes.fifo -e edges.fifo -- repo_dir_1 repo_dir_2 ... + $ rm nodes.fifo edges.fifo Note that you most likely want to tune `sort` in order to be parallel (`--parallel`), use a large buffer size (`-S`), and use a temporary directory with enough available space (`-T`). (The above example uses `parallel` from [moreutils](https://joeyh.name/code/moreutils/), but it could trivially be adapted to use [GNU parallel](https://www.gnu.org/software/parallel/) or similar parallelization tools.) diff --git a/tools/git2graph/git2graph.c b/tools/git2graph/git2graph.c index e172606..87f0c41 100644 --- a/tools/git2graph/git2graph.c +++ b/tools/git2graph/git2graph.c @@ -1,478 +1,498 @@ /* * Copyright (C) 2019 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information */ -/* Crawl a Git repository and output it as a graph, i.e., as textual file - * containing a list of graph edges, one per line. Each edge is a - * pair of Software Heritage (SWH) Persistent Identifiers (PIDs). +/* Crawls a Git repository and outputs it as a graph, i.e., as a pair of + * textual files . The nodes file will contain a list of graph + * nodes as Software Heritage (SWH) Persistent Identifiers (PIDs); the edges + * file a list of graph edges as PID pairs. */ #include #include #include #include #include #include #include #include #include #include #define SWH_PREFIX "swh:1" #define SWH_DIR "swh:1:dir" #define SWH_REV "swh:1:rev" #define SWH_PIDSZ (GIT_OID_HEXSZ + 10) // size of a SWH PID -// length of a textual edge line +// line-lengths in nodes and edges file +#define NODES_LINELEN (SWH_PIDSZ + 1) #define EDGES_LINELEN (SWH_PIDSZ * 2 + 2) -// Output buffer size for edges files. To guarantee atomic and non-interleaved -// writes (which matter when used concurrently writing to a shared FIFO), size -// must be <= PIPE_BUF and a multiple of EDGES_LINELEN. +// Output buffer sizes for nodes and edges files. To guarantee atomic and +// non-interleaved writes (which matter when used concurrently writing to a +// shared FIFO), these sizes must be <= PIPE_BUF and multiples of +// {NODES,EDGES}_LINELEN. +#define NODES_OUTSZ ((PIPE_BUF / NODES_LINELEN) * NODES_LINELEN) #define EDGES_OUTSZ ((PIPE_BUF / EDGES_LINELEN) * EDGES_LINELEN) // GIT_OBJ_* constants extension for non-git objects #define SWH_OBJ_SNP 5 // snapshots (swh:1:snp:...) #define SWH_OBJ_ORI 6 // origins (swh:1:ori:...) #define SWH_OBJ_LOC 7 // lines of code (swh:1:loc:...) #define OBJ_TYPES 8 #define ELT_SEP "," // element separator in lists #define PAIR_SEP ":" // key/value separator in paris /* map from libgit2's git_otype (+ SWH-specific types above) to SWH PID type * qualifiers */ static char *_git_otype2swh[OBJ_TYPES] = { "*", // 0 == GIT_OBJ__EXT1 (unused in libgit2, used as wildcard here) "rev", // 1 == GIT_OBJ_COMMIT "dir", // 2 == GIT_OBJ_TREE "cnt", // 3 == GIT_OBJ_BLOB "rel", // 4 == GIT_OBJ_TAG "snp", // 5 == SWH_OBJ_SNP "ori", // 6 == SWH_OBJ_ORI "loc", // 7 == SWH_OBJ_LOC }; #define GIT_OBJ_ANY GIT_OBJ__EXT1 /* Convert a git object type (+ SWH-specific types above) to the corresponding * SWH PID type. */ #define git_otype2swh(type) _git_otype2swh[(type)] /* Parse object type (libgit's + SWH-specific types) from 3-letter type * qualifiers. Return either object type, or 0 in case of "*" wildcard, or -1 * in case of parse error. */ int parse_otype(char *str) { for (int i = 0; i < OBJ_TYPES; i++) { if (strcmp(str, _git_otype2swh[i]) == 0) return i; } return -1; } /* Allowed edge types matrix. Each cell denotes whether edges from a given * SRC_TYPE to a given DST_TYPE should be produced or not. */ static int _allowed_edges[OBJ_TYPES][OBJ_TYPES] = { // TO rev dir cnt rel snp ori loc | // ---------------------------------------------------------------- {true, true, true, true, true, true, true, true}, // | FROM {true, true, true, true, true, true, true, true}, // | rev {true, true, true, true, true, true, true, true}, // | dir {true, true, true, true, true, true, true, true}, // | cnt {true, true, true, true, true, true, true, true}, // | rel {true, true, true, true, true, true, true, true}, // | snp {true, true, true, true, true, true, true, true}, // | ori {true, true, true, true, true, true, true, true}, // | loc }; -/* Whether a nore type is allowed as *origin* for edges. Derived information - * from the _allowed_edges matrix. */ +/* Allowed node types vector. */ static int _allowed_nodes[OBJ_TYPES] = { true, // true, // rev true, // dir true, // cnt true, // rel true, // snp true, // ori true, // loc }; #define is_edge_allowed(src_type, dst_type) _allowed_edges[(src_type)][(dst_type)] #define is_node_allowed(type) _allowed_nodes[(type)] /* extra payload for callback invoked on Git objects */ typedef struct { git_odb *odb; // Git object DB git_repository *repo; // Git repository + FILE *nodes_out; // stream to write nodes to, or NULL FILE *edges_out; // stream to write edges to, or NULL } cb_payload; /* Invoke a libgit2 method and exits with an error message in case of * failure. * * Reused from libgit2 examples, specifically common.c, available under CC0. */ void check_lg2(int error, const char *message, const char *extra) { const git_error *lg2err; const char *lg2msg = "", *lg2spacer = ""; if (!error) return; if ((lg2err = giterr_last()) != NULL && lg2err->message != NULL) { lg2msg = lg2err->message; lg2spacer = " - "; } if (extra) fprintf(stderr, "%s '%s' [%d]%s%s\n", message, extra, error, lg2spacer, lg2msg); else fprintf(stderr, "%s [%d]%s%s\n", message, error, lg2spacer, lg2msg); exit(1); } -/* Compute allowed node types based on allowed edge types. */ -void init_allowed_nodes_from_edges( - int allowed_edges[OBJ_TYPES][OBJ_TYPES], - int allowed_nodes[OBJ_TYPES]) -{ - for (int i = 0; i < OBJ_TYPES; i++) { - allowed_nodes[i] = false; // disallowed by default - // allowed if an edge can originate from it... - for (int src_type = 0; src_type < OBJ_TYPES; src_type++) - allowed_nodes[i] = allowed_nodes[i] \ - || allowed_edges[src_type][i]; - // ...or lead to it - for (int dst_type = 0; dst_type < OBJ_TYPES; dst_type++) - allowed_nodes[i] = allowed_nodes[i] \ - || allowed_edges[i][dst_type]; - } -} - - /* Emit commit edges. */ void emit_commit_edges(const git_commit *commit, const char *swhpid, FILE *out) { unsigned int i, max_i; char oidstr[GIT_OID_HEXSZ + 1]; // to PID // rev -> dir if (is_edge_allowed(GIT_OBJ_COMMIT, GIT_OBJ_TREE)) { git_oid_tostr(oidstr, sizeof(oidstr), git_commit_tree_id(commit)); fprintf(out, "%s %s:%s\n", swhpid, SWH_DIR, oidstr); } // rev -> rev if (is_edge_allowed(GIT_OBJ_COMMIT, GIT_OBJ_COMMIT)) { max_i = (unsigned int)git_commit_parentcount(commit); for (i = 0; i < max_i; ++i) { git_oid_tostr(oidstr, sizeof(oidstr), git_commit_parent_id(commit, i)); fprintf(out, "%s %s:%s\n", swhpid, SWH_REV, oidstr); } } } /* Emit tag edges. */ void emit_tag_edges(const git_tag *tag, const char *swhpid, FILE *out) { char oidstr[GIT_OID_HEXSZ + 1]; int target_type; // rel -> * target_type = git_tag_target_type(tag); if (is_edge_allowed(GIT_OBJ_TAG, target_type)) { git_oid_tostr(oidstr, sizeof(oidstr), git_tag_target_id(tag)); fprintf(out, "%s %s:%s:%s\n", swhpid, SWH_PREFIX, git_otype2swh(target_type), oidstr); } } /* Emit tree edges. */ void emit_tree_edges(const git_tree *tree, const char *swhpid, FILE *out) { size_t i, max_i = (int)git_tree_entrycount(tree); char oidstr[GIT_OID_HEXSZ + 1]; const git_tree_entry *te; int entry_type; // dir -> * for (i = 0; i < max_i; ++i) { te = git_tree_entry_byindex(tree, i); entry_type = git_tree_entry_type(te); if (is_edge_allowed(GIT_OBJ_TREE, entry_type)) { git_oid_tostr(oidstr, sizeof(oidstr), git_tree_entry_id(te)); fprintf(out, "%s %s:%s:%s\n", swhpid, SWH_PREFIX, git_otype2swh(entry_type), oidstr); } } } -/* Emit edges for current object. */ +/* Emit node and edges for current object. */ int emit_obj(const git_oid *id, void *payload) { char oidstr[GIT_OID_HEXSZ + 1]; char swhpid[SWH_PIDSZ + 1]; size_t len; int obj_type; git_commit *commit; git_tag *tag; git_tree *tree; git_odb *odb = ((cb_payload *) payload)->odb; git_repository *repo = ((cb_payload *) payload)->repo; + FILE *nodes_out = ((cb_payload *) payload)->nodes_out; FILE *edges_out = ((cb_payload *) payload)->edges_out; check_lg2(git_odb_read_header(&len, &obj_type, odb, id), "cannot read object header", NULL); - if (!is_node_allowed(obj_type)) - return 0; - // format node PID + // emit node sprintf(swhpid, "swh:1:%s:", git_otype2swh(obj_type)); git_oid_tostr(swhpid + 10, sizeof(oidstr), id); - - // emit edges - switch (obj_type) { - case GIT_OBJ_BLOB: // graph leaf: no edges to emit - break; - case GIT_OBJ_COMMIT: - check_lg2(git_commit_lookup(&commit, repo, id), - "cannot find commit", NULL); - emit_commit_edges(commit, swhpid, edges_out); - git_commit_free(commit); - break; - case GIT_OBJ_TAG: - check_lg2(git_tag_lookup(&tag, repo, id), - "cannot find tag", NULL); - emit_tag_edges(tag, swhpid, edges_out); - git_tag_free(tag); - break; - case GIT_OBJ_TREE: - check_lg2(git_tree_lookup(&tree, repo, id), - "cannot find tree", NULL); - emit_tree_edges(tree, swhpid, edges_out); - git_tree_free(tree); - break; - default: - git_oid_tostr(oidstr, sizeof(oidstr), id); - fprintf(stderr, "ignoring unknown object: %s\n", oidstr); - break; + if (nodes_out != NULL && is_node_allowed(obj_type)) + fprintf(nodes_out, "%s\n", swhpid); + + if (edges_out != NULL) { + // emit edges + switch (obj_type) { + case GIT_OBJ_BLOB: // graph leaf: no edges to emit + break; + case GIT_OBJ_COMMIT: + check_lg2(git_commit_lookup(&commit, repo, id), + "cannot find commit", NULL); + emit_commit_edges(commit, swhpid, edges_out); + git_commit_free(commit); + break; + case GIT_OBJ_TAG: + check_lg2(git_tag_lookup(&tag, repo, id), + "cannot find tag", NULL); + emit_tag_edges(tag, swhpid, edges_out); + git_tag_free(tag); + break; + case GIT_OBJ_TREE: + check_lg2(git_tree_lookup(&tree, repo, id), + "cannot find tree", NULL); + emit_tree_edges(tree, swhpid, edges_out); + git_tree_free(tree); + break; + default: + git_oid_tostr(oidstr, sizeof(oidstr), id); + fprintf(stderr, "E: ignoring unknown object: %s\n", oidstr); + break; + } } return 0; } void exit_usage(char *msg) { if (msg != NULL) fprintf(stderr, "Error: %s\n\n", msg); fprintf(stderr, "Usage: git2graph [OPTION..] GIT_REPO_DIR\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -o, --output=PATH output file, default to stdout\n"); + fprintf(stderr, " -e, --edges-output=PATH edges output file (default: stdout)\n"); + fprintf(stderr, " -n, --nodes-output=PATH nodes output file (default: stdout)\n"); fprintf(stderr, " -E, --edges-filter=EDGES_EXPR only emit selected edges\n"); + fprintf(stderr, " -N, --nodes-filter=NODES_EXPR only emit selected nodes\n"); fprintf(stderr, "\n"); fprintf(stderr, "EDGES_EXPR is a comma separate list of src_TYPE:dst_TYPE pairs\n"); + fprintf(stderr, "NODES_EXPR is a comme separate list of node TYPEs\n"); + fprintf(stderr, "{NODES,EDGES}_EXPR can be empty strings to filter *out* all elements.\n"); fprintf(stderr, "TYPE is one of: cnt, dir, loc, ori, rel, rev, snp, *\n"); fprintf(stderr, "\nNote: you can use \"-\" for stdout in file names.\n"); - exit(EXIT_FAILURE); } /* command line arguments */ typedef struct { - char *outfile; + char *nodes_out; + char *edges_out; + char *nodes_filter; char *edges_filter; char *repo_dir; } cli_args; cli_args *parse_cli(int argc, char **argv) { int opt; cli_args *args = malloc(sizeof(cli_args)); if (args == NULL) { perror("Cannot allocate memory."); exit(EXIT_FAILURE); } else { - args->outfile = NULL; + args->nodes_out = NULL; + args->edges_out = NULL; + args->nodes_filter = NULL; args->edges_filter = NULL; args->repo_dir = NULL; } static struct option long_opts[] = { + {"edges-output", required_argument, 0, 'e' }, + {"nodes-output", required_argument, 0, 'n' }, {"edges-filter", required_argument, 0, 'E' }, - {"output", required_argument, 0, 'o' }, + {"nodes-filter", required_argument, 0, 'N' }, {"help", no_argument, 0, 'h' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "E:o:h", long_opts, + while ((opt = getopt_long(argc, argv, "e:n:E:N:h", long_opts, NULL)) != -1) { switch (opt) { + case 'e': args->edges_out = optarg; break; + case 'n': args->nodes_out = optarg; break; case 'E': args->edges_filter = optarg; break; - case 'o': args->outfile = optarg; break; + case 'N': args->nodes_filter = optarg; break; case 'h': default: exit_usage(NULL); } } if (argv[optind] == NULL) exit_usage(NULL); args->repo_dir = argv[optind]; - if (args->outfile == NULL) - args->outfile = "-"; + if (args->edges_out == NULL) + args->edges_out = "-"; + if (args->nodes_out == NULL) + args->nodes_out = "-"; return args; } /* open output stream specified on the command line (if at all) */ FILE *open_out_stream(char *cli_path, char *buf, int bufsiz) { FILE *stream; if (cli_path == NULL) stream = NULL; else if (strcmp(cli_path, "-") == 0) stream = stdout; else if((stream = fopen(cli_path, "w")) == NULL) { fprintf(stderr, "can't open file: %s\n", cli_path); exit(EXIT_FAILURE); } // ensure atomic and non-interleaved writes if (stream != NULL) setvbuf(stream, buf, _IOFBF, bufsiz); return stream; } void fill_matrix(int matrix[OBJ_TYPES][OBJ_TYPES], int val) { for (int i = 0; i < OBJ_TYPES; i++) for (int j = 0; j < OBJ_TYPES; j++) matrix[i][j] = val; } void fill_row(int matrix[OBJ_TYPES][OBJ_TYPES], int row, int val) { for (int j = 0; j < OBJ_TYPES; j++) matrix[row][j] = val; } void fill_column(int matrix[OBJ_TYPES][OBJ_TYPES], int col, int val) { for (int i = 0; i < OBJ_TYPES; i++) matrix[i][col] = val; } void fill_vector(int vector[OBJ_TYPES], int val) { for (int i = 0; i < OBJ_TYPES; i++) vector[i] = val; } /* Dump node/edge filters to a given stream. For debugging purposes. */ void _dump_filters(FILE *out, int matrix[OBJ_TYPES][OBJ_TYPES], int vector[OBJ_TYPES]) { fprintf(out, "TO rev dir cnt rel snp ori loc FROM\n"); for(int i = 0; i < OBJ_TYPES; i++) { for(int j = 0; j < OBJ_TYPES; j++) fprintf(out, "%d ", matrix[i][j]); fprintf(out, "%s\n", _git_otype2swh[i]); } fprintf(out, " rev dir cnt rel snp ori loc\n"); for (int i = 0; i < OBJ_TYPES; i++) fprintf(out, "%d ", vector[i]); } /* set up nodes and edges restrictions, interpreting command line filters */ -void init_graph_filters(char *edges_filter) { +void init_graph_filters(char *nodes_filter, char *edges_filter) { char **filters; char **types; char **ptr; int src_type, dst_type; + // Note: when either filter is NULL, the parsing loops below will be + // skipped (due to g_strsplit's semantics on empty strings), which is + // what we want: all elements will be forbidden. + if (edges_filter != NULL) { fill_matrix(_allowed_edges, false); // nothing allowed by default filters = g_strsplit(edges_filter, ELT_SEP, -1); // "typ:typ" pairs for (ptr = filters; *ptr; ptr++) { types = g_strsplit(*ptr, PAIR_SEP, 2); // 2 "typ" fragments src_type = parse_otype(types[0]); dst_type = parse_otype(types[1]); if (src_type == GIT_OBJ_ANY && dst_type == GIT_OBJ_ANY) { // "*:*" wildcard fill_matrix(_allowed_edges, true); break; // all edges allowed already } else if (src_type == GIT_OBJ_ANY) { // "*:typ" wildcard fill_column(_allowed_edges, dst_type, true); } else if (dst_type == GIT_OBJ_ANY) { // "typ:*" wildcard fill_row(_allowed_edges, src_type, true); } else // "src_type:dst_type" _allowed_edges[src_type][dst_type] = true; g_strfreev(types); } g_strfreev(filters); } - init_allowed_nodes_from_edges(_allowed_edges, _allowed_nodes); + if (nodes_filter != NULL) { + fill_vector(_allowed_nodes, false); // nothing allowed by default + filters = g_strsplit(nodes_filter, ELT_SEP, -1); // "typ" fragments + for (ptr = filters; *ptr; ptr++) { + src_type = parse_otype(*ptr); + if (src_type == GIT_OBJ_ANY) { // "*" wildcard + fill_vector(_allowed_nodes, true); + break; // all nodes allowed already + } else + _allowed_nodes[src_type] = true; + } + g_strfreev(filters); + } } int main(int argc, char **argv) { git_repository *repo; git_odb *odb; int rc; cli_args *args; cb_payload *payload; - FILE *edges_out; - char edges_buf[EDGES_OUTSZ]; + FILE *nodes_out, *edges_out; + char nodes_buf[EDGES_OUTSZ], edges_buf[EDGES_OUTSZ]; args = parse_cli(argc, argv); - init_graph_filters(args->edges_filter); + init_graph_filters(args->nodes_filter, args->edges_filter); // _dump_filters(stdout, _allowed_edges, _allowed_nodes); git_libgit2_init(); check_lg2(git_repository_open(&repo, args->repo_dir), "cannot open repository", NULL); check_lg2(git_repository_odb(&odb, repo), "cannot get object DB", NULL); - edges_out = open_out_stream(args->outfile, edges_buf, EDGES_OUTSZ); + nodes_out = open_out_stream(args->nodes_out, nodes_buf, NODES_OUTSZ); + edges_out = open_out_stream(args->edges_out, edges_buf, EDGES_OUTSZ); + assert(NODES_OUTSZ <= PIPE_BUF && (NODES_OUTSZ % NODES_LINELEN == 0)); assert(EDGES_OUTSZ <= PIPE_BUF && (EDGES_OUTSZ % EDGES_LINELEN == 0)); payload = malloc(sizeof(cb_payload)); payload->odb = odb; payload->repo = repo; + payload->nodes_out = nodes_out; payload->edges_out = edges_out; rc = git_odb_foreach(odb, emit_obj, payload); check_lg2(rc, "failure during object iteration", NULL); git_odb_free(odb); git_repository_free(repo); free(payload); exit(rc); } diff --git a/tools/git2graph/tests/data/graphs/dir-nodes/edges.csv b/tools/git2graph/tests/data/graphs/dir-nodes/edges.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/dir-nodes/nodes.csv b/tools/git2graph/tests/data/graphs/dir-nodes/nodes.csv new file mode 100644 index 0000000..129fd0f --- /dev/null +++ b/tools/git2graph/tests/data/graphs/dir-nodes/nodes.csv @@ -0,0 +1,7 @@ +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 +swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 diff --git a/tools/git2graph/tests/data/graphs/directories/edges.csv b/tools/git2graph/tests/data/graphs/from-dir-edges/edges.csv similarity index 100% rename from tools/git2graph/tests/data/graphs/directories/edges.csv rename to tools/git2graph/tests/data/graphs/from-dir-edges/edges.csv diff --git a/tools/git2graph/tests/data/graphs/from-dir-edges/nodes.csv b/tools/git2graph/tests/data/graphs/from-dir-edges/nodes.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/releases/edges.csv b/tools/git2graph/tests/data/graphs/from-rel-edges/edges.csv similarity index 100% rename from tools/git2graph/tests/data/graphs/releases/edges.csv rename to tools/git2graph/tests/data/graphs/from-rel-edges/edges.csv diff --git a/tools/git2graph/tests/data/graphs/from-rel-edges/nodes.csv b/tools/git2graph/tests/data/graphs/from-rel-edges/nodes.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/fs-nodes/edges.csv b/tools/git2graph/tests/data/graphs/fs-nodes/edges.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/fs-nodes/nodes.csv b/tools/git2graph/tests/data/graphs/fs-nodes/nodes.csv new file mode 100644 index 0000000..505096b --- /dev/null +++ b/tools/git2graph/tests/data/graphs/fs-nodes/nodes.csv @@ -0,0 +1,13 @@ +swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 +swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 +swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca +swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 +swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 diff --git a/tools/git2graph/tests/data/graphs/full/nodes.csv b/tools/git2graph/tests/data/graphs/full/nodes.csv new file mode 100644 index 0000000..ea100d1 --- /dev/null +++ b/tools/git2graph/tests/data/graphs/full/nodes.csv @@ -0,0 +1,22 @@ +swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985 +swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23 +swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99 +swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6 +swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca +swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5 +swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3 +swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7 +swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8 +swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687 +swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c +swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c +swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5 +swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412 +swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5 +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 +swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 diff --git a/tools/git2graph/tests/data/graphs/revisions/edges.csv b/tools/git2graph/tests/data/graphs/rev-edges/edges.csv similarity index 100% rename from tools/git2graph/tests/data/graphs/revisions/edges.csv rename to tools/git2graph/tests/data/graphs/rev-edges/edges.csv diff --git a/tools/git2graph/tests/data/graphs/rev-edges/nodes.csv b/tools/git2graph/tests/data/graphs/rev-edges/nodes.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/rev-nodes/edges.csv b/tools/git2graph/tests/data/graphs/rev-nodes/edges.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/data/graphs/rev-nodes/nodes.csv b/tools/git2graph/tests/data/graphs/rev-nodes/nodes.csv new file mode 100644 index 0000000..e324159 --- /dev/null +++ b/tools/git2graph/tests/data/graphs/rev-nodes/nodes.csv @@ -0,0 +1,7 @@ +swh:1:rev:20cca959bae94594f60450f339b408581f1b401f +swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56 +swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039 +swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6 +swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6 +swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0 +swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9 diff --git a/tools/git2graph/tests/data/graphs/to-revisions/edges.csv b/tools/git2graph/tests/data/graphs/to-rev-edges/edges.csv similarity index 100% rename from tools/git2graph/tests/data/graphs/to-revisions/edges.csv rename to tools/git2graph/tests/data/graphs/to-rev-edges/edges.csv diff --git a/tools/git2graph/tests/data/graphs/to-rev-edges/nodes.csv b/tools/git2graph/tests/data/graphs/to-rev-edges/nodes.csv new file mode 100644 index 0000000..e69de29 diff --git a/tools/git2graph/tests/edge-filters.bats b/tools/git2graph/tests/edge-filters.bats new file mode 100644 index 0000000..086af6f --- /dev/null +++ b/tools/git2graph/tests/edge-filters.bats @@ -0,0 +1,23 @@ +#!/usr/bin/env bats + +load repo_helper + +@test "export revision self-edges" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E rev:rev + assert_equal_graphs ${DATA_DIR}/graphs/rev-edges ${TEST_TMPDIR} +} + +@test "export edges to revisions" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "*:rev" + assert_equal_graphs ${DATA_DIR}/graphs/to-rev-edges ${TEST_TMPDIR} +} + +@test "export edges from directories" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "dir:*" + assert_equal_graphs ${DATA_DIR}/graphs/from-dir-edges ${TEST_TMPDIR} +} + +@test "export edges from releases" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "rel:*" + assert_equal_graphs ${DATA_DIR}/graphs/from-rel-edges ${TEST_TMPDIR} +} diff --git a/tools/git2graph/tests/filters.bats b/tools/git2graph/tests/filters.bats deleted file mode 100644 index 8d99209..0000000 --- a/tools/git2graph/tests/filters.bats +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bats - -load repo_helper - -@test "export revisions" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E rev:rev - assert_equal_graphs ${DATA_DIR}/graphs/revisions ${TEST_TMPDIR} -} - -@test "export edges with revision targets" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:rev" - assert_equal_graphs ${DATA_DIR}/graphs/to-revisions ${TEST_TMPDIR} -} - -@test "export directories" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "dir:*" - assert_equal_graphs ${DATA_DIR}/graphs/directories ${TEST_TMPDIR} -} - -@test "export releases" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "rel:*" - assert_equal_graphs ${DATA_DIR}/graphs/releases ${TEST_TMPDIR} -} diff --git a/tools/git2graph/tests/full-graph.bats b/tools/git2graph/tests/full-graph.bats index 8c93a94..1f9d5c4 100644 --- a/tools/git2graph/tests/full-graph.bats +++ b/tools/git2graph/tests/full-graph.bats @@ -1,13 +1,8 @@ #!/usr/bin/env bats load repo_helper @test "export entire graph" { run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" assert_equal_graphs ${DATA_DIR}/graphs/full ${TEST_TMPDIR} } - -@test "export entire graph (using wildcard)" { - run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:*" - assert_equal_graphs ${DATA_DIR}/graphs/full ${TEST_TMPDIR} -} diff --git a/tools/git2graph/tests/node-filters.bats b/tools/git2graph/tests/node-filters.bats new file mode 100644 index 0000000..622d27b --- /dev/null +++ b/tools/git2graph/tests/node-filters.bats @@ -0,0 +1,18 @@ +#!/usr/bin/env bats + +load repo_helper + +@test "export revision nodes" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N rev -E '' + assert_equal_graphs ${DATA_DIR}/graphs/rev-nodes ${TEST_TMPDIR} +} + +@test "export directory nodes" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N dir -E '' + assert_equal_graphs ${DATA_DIR}/graphs/dir-nodes ${TEST_TMPDIR} +} + +@test "export file system layer nodes" { + run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N cnt,dir -E '' + assert_equal_graphs ${DATA_DIR}/graphs/fs-nodes ${TEST_TMPDIR} +} diff --git a/tools/git2graph/tests/repo_helper.bash b/tools/git2graph/tests/repo_helper.bash index 36160b6..0d2c5dc 100644 --- a/tools/git2graph/tests/repo_helper.bash +++ b/tools/git2graph/tests/repo_helper.bash @@ -1,40 +1,41 @@ DATA_DIR="${BATS_TEST_DIRNAME}/data" TEST_REPO_TGZ="${DATA_DIR}/sample-repo.tgz" setup () { TEST_TMPDIR=$(mktemp -td swh-graph-test.XXXXXXXXXX) (cd "$TEST_TMPDIR" ; tar xaf "$TEST_REPO_TGZ") TEST_REPO_DIR="${TEST_TMPDIR}/sample-repo" } teardown () { rm -rf "$TEST_TMPDIR" } # Invoke git2graph (SUT) on the given repo_dir and store its results in the CSV # files nodes.csv and edges.csv located under the given dest_dir. run_git2graph () { repo_dir="$1" dest_dir="$2" shift 2 nodes_file="${dest_dir}/nodes.csv" edges_file="${dest_dir}/edges.csv" if [ ! -d "$dest_dir" ] ; then mkdir -p "$dest_dir" fi - ./git2graph "$@" "$repo_dir" | sort > "$edges_file" + ./git2graph "$@" -n >(sort > "$nodes_file") -e >(sort > "$edges_file") "$repo_dir" } # Ensure that two graphs, each specified as a dir that should contain a pair of # sorted, textual files called nodes.csv and edges.csv. Comparison is done # using diff. assert_equal_graphs () { dir_1="$1" dir_2="$2" + diff "${dir_1}/nodes.csv" "${dir_2}/nodes.csv" && diff "${dir_1}/edges.csv" "${dir_2}/edges.csv" }