Skip to content
Snippets Groups Projects
Commit 303d0173 authored by Stefano Zacchiroli's avatar Stefano Zacchiroli
Browse files

git2graph: add back node output support, with simpler/saner semantics

Rationale: generating the nodes file from the edges file is not reasonable in
terms of processing time. For linux.git alone, tr + sort -u can take up to 1
hour, depending on the sort setup. On the other hand outputting (unsorted, but
unique) nodes via git2graph adds near-zero overhead w.r.t. outputting edges.

The sane semantics for nodes/edges selection is to completely separate
filtering. The user is expected to filter nodes *and* edges on the command
line (if desired), and neither trickles to the other. So it is possible to,
say, emit "rev:rev" edges and "dir,cnt" nodes; it is up to the user to select a
reasonable semantics.

It is also now possible to filter *out* all nodes/edges, passing empty strings
as filters. That might be needed when one really wants all and only nodes
corresponding to selected edges; in that case nodes output should be
suppressed, and tr+sort used separately.  Note that doing so is not always
desirable, as it excludes singleton nodes, not connected to anything at
all (which do exist!).

This commit partially reverts d2ff3227
parent a3aeaed4
No related branches found
No related tags found
No related merge requests found
Showing
with 174 additions and 121 deletions
......@@ -19,28 +19,16 @@ Test dependencies:
- [bats](https://github.com/bats-core/bats-core)
Nodes file
----------
`git2graph` outputs a textual edges file. If you also need a *nodes* file, with
one PID per line, you can postprocess the edges files as follows:
$ git2graph REPO_DIR > edges.csv
$ sort -u < edges.csv > nodes.csv
Micro benchmark
---------------
$ time ./git2graph -o >(pigz -c > edges.csv.gz) /srv/src/linux
./git2graph -o >(pigz -c > edges.csv.gz) /srv/src/linux 232,06s user 16,24s system 90% cpu 4:35,52 total
$ time ./git2graph -n >(pigz -c > nodes.csv.gz) -e >(pigz -c > edges.csv.gz) /srv/src/linux
232,06s user 16,24s system 90% cpu 4:40,35 total
$ zcat edges.csv.gz | wc -l
305095437
$ zcat edges.csv.gz | tr ' ' '\n' | sort -u | pigz -c > nodes.csv.gz
$ zcat nodes.csv.gz | wc -l
6503402
$ zcat edges.csv.gz | wc -l
305095437
Parallel use
......@@ -52,10 +40,11 @@ bytes on Linux, and guaranteed to be at least 512 bytes by POSIX), writes are
atomic. Hence it is possible to mass analyze many repositories in parallel with
something like:
$ mkfifo edges.fifo
$ mkfifo nodes.fifo edges.fifo
$ sort -u < nodes.fifo | pigz -c > nodes.csv.gz &
$ sort -u < edges.fifo | pigz -c > edges.csv.gz &
$ parallel git2graph -o edges.fifo -- repo_dir_1 repo_dir_2 ...
$ rm edges.fifo
$ parallel git2graph -n nodes.fifo -e edges.fifo -- repo_dir_1 repo_dir_2 ...
$ rm nodes.fifo edges.fifo
Note that you most likely want to tune `sort` in order to be parallel
(`--parallel`), use a large buffer size (`-S`), and use a temporary directory
......
......@@ -5,9 +5,10 @@
* See top-level LICENSE file for more information
*/
/* Crawl a Git repository and output it as a graph, i.e., as textual file
* containing a list of graph edges, one per line. Each edge is a <from, to>
* pair of Software Heritage (SWH) Persistent Identifiers (PIDs).
/* Crawls a Git repository and outputs it as a graph, i.e., as a pair of
* textual files <nodes, edges>. The nodes file will contain a list of graph
* nodes as Software Heritage (SWH) Persistent Identifiers (PIDs); the edges
* file a list of graph edges as <from, to> PID pairs.
*/
#include <assert.h>
......@@ -28,12 +29,15 @@
#define SWH_REV "swh:1:rev"
#define SWH_PIDSZ (GIT_OID_HEXSZ + 10) // size of a SWH PID
// length of a textual edge line
// line-lengths in nodes and edges file
#define NODES_LINELEN (SWH_PIDSZ + 1)
#define EDGES_LINELEN (SWH_PIDSZ * 2 + 2)
// Output buffer size for edges files. To guarantee atomic and non-interleaved
// writes (which matter when used concurrently writing to a shared FIFO), size
// must be <= PIPE_BUF and a multiple of EDGES_LINELEN.
// Output buffer sizes for nodes and edges files. To guarantee atomic and
// non-interleaved writes (which matter when used concurrently writing to a
// shared FIFO), these sizes must be <= PIPE_BUF and multiples of
// {NODES,EDGES}_LINELEN.
#define NODES_OUTSZ ((PIPE_BUF / NODES_LINELEN) * NODES_LINELEN)
#define EDGES_OUTSZ ((PIPE_BUF / EDGES_LINELEN) * EDGES_LINELEN)
// GIT_OBJ_* constants extension for non-git objects
......@@ -91,8 +95,7 @@ static int _allowed_edges[OBJ_TYPES][OBJ_TYPES] = {
{true, true, true, true, true, true, true, true}, // | loc
};
/* Whether a node type is allowed as *origin* for edges. Derived information
 * from the _allowed_edges matrix. */
/* Allowed node types vector. */
static int _allowed_nodes[OBJ_TYPES] = {
true, //
true, // rev
......@@ -112,6 +115,7 @@ static int _allowed_nodes[OBJ_TYPES] = {
/* Payload passed to the per-object callback (emit_obj): the Git handles used
 * to look objects up, plus the output streams. A NULL stream disables the
 * corresponding output. */
typedef struct {
git_odb *odb; // Git object DB
git_repository *repo; // Git repository
FILE *nodes_out; // stream to write nodes to, or NULL
FILE *edges_out; // stream to write edges to, or NULL
} cb_payload;
......@@ -144,25 +148,6 @@ void check_lg2(int error, const char *message, const char *extra) {
}
/* Derive the allowed node types from the allowed edge types.
 *
 * A node type is allowed as soon as at least one allowed edge has it as an
 * endpoint, i.e., some entry in its row or in its column of allowed_edges is
 * set. Every entry of allowed_nodes is overwritten with 0 or 1. */
void init_allowed_nodes_from_edges(
    int allowed_edges[OBJ_TYPES][OBJ_TYPES],
    int allowed_nodes[OBJ_TYPES])
{
    for (int node = 0; node < OBJ_TYPES; node++) {
        int allowed = false;  // disallowed until a matching edge is found

        // one pass inspects both column `node` and row `node`; stop at the
        // first hit, since further entries cannot change the outcome
        for (int other = 0; other < OBJ_TYPES && !allowed; other++) {
            if (allowed_edges[other][node] || allowed_edges[node][other])
                allowed = true;
        }

        allowed_nodes[node] = allowed;
    }
}
/* Emit commit edges. */
void emit_commit_edges(const git_commit *commit, const char *swhpid, FILE *out) {
unsigned int i, max_i;
......@@ -222,7 +207,7 @@ void emit_tree_edges(const git_tree *tree, const char *swhpid, FILE *out) {
}
/* Emit edges for current object. */
/* Emit node and edges for current object. */
int emit_obj(const git_oid *id, void *payload) {
char oidstr[GIT_OID_HEXSZ + 1];
char swhpid[SWH_PIDSZ + 1];
......@@ -234,43 +219,46 @@ int emit_obj(const git_oid *id, void *payload) {
git_odb *odb = ((cb_payload *) payload)->odb;
git_repository *repo = ((cb_payload *) payload)->repo;
FILE *nodes_out = ((cb_payload *) payload)->nodes_out;
FILE *edges_out = ((cb_payload *) payload)->edges_out;
check_lg2(git_odb_read_header(&len, &obj_type, odb, id),
"cannot read object header", NULL);
if (!is_node_allowed(obj_type))
return 0;
// format node PID
// emit node
sprintf(swhpid, "swh:1:%s:", git_otype2swh(obj_type));
git_oid_tostr(swhpid + 10, sizeof(oidstr), id);
// emit edges
switch (obj_type) {
case GIT_OBJ_BLOB: // graph leaf: no edges to emit
break;
case GIT_OBJ_COMMIT:
check_lg2(git_commit_lookup(&commit, repo, id),
"cannot find commit", NULL);
emit_commit_edges(commit, swhpid, edges_out);
git_commit_free(commit);
break;
case GIT_OBJ_TAG:
check_lg2(git_tag_lookup(&tag, repo, id),
"cannot find tag", NULL);
emit_tag_edges(tag, swhpid, edges_out);
git_tag_free(tag);
break;
case GIT_OBJ_TREE:
check_lg2(git_tree_lookup(&tree, repo, id),
"cannot find tree", NULL);
emit_tree_edges(tree, swhpid, edges_out);
git_tree_free(tree);
break;
default:
git_oid_tostr(oidstr, sizeof(oidstr), id);
fprintf(stderr, "ignoring unknown object: %s\n", oidstr);
break;
if (nodes_out != NULL && is_node_allowed(obj_type))
fprintf(nodes_out, "%s\n", swhpid);
if (edges_out != NULL) {
// emit edges
switch (obj_type) {
case GIT_OBJ_BLOB: // graph leaf: no edges to emit
break;
case GIT_OBJ_COMMIT:
check_lg2(git_commit_lookup(&commit, repo, id),
"cannot find commit", NULL);
emit_commit_edges(commit, swhpid, edges_out);
git_commit_free(commit);
break;
case GIT_OBJ_TAG:
check_lg2(git_tag_lookup(&tag, repo, id),
"cannot find tag", NULL);
emit_tag_edges(tag, swhpid, edges_out);
git_tag_free(tag);
break;
case GIT_OBJ_TREE:
check_lg2(git_tree_lookup(&tree, repo, id),
"cannot find tree", NULL);
emit_tree_edges(tree, swhpid, edges_out);
git_tree_free(tree);
break;
default:
git_oid_tostr(oidstr, sizeof(oidstr), id);
fprintf(stderr, "E: ignoring unknown object: %s\n", oidstr);
break;
}
}
return 0;
......@@ -284,21 +272,26 @@ void exit_usage(char *msg) {
fprintf(stderr, "Usage: git2graph [OPTION..] GIT_REPO_DIR\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -o, --output=PATH output file, default to stdout\n");
fprintf(stderr, " -e, --edges-output=PATH edges output file (default: stdout)\n");
fprintf(stderr, " -n, --nodes-output=PATH nodes output file (default: stdout)\n");
fprintf(stderr, " -E, --edges-filter=EDGES_EXPR only emit selected edges\n");
fprintf(stderr, " -N, --nodes-filter=NODES_EXPR only emit selected nodes\n");
fprintf(stderr, "\n");
fprintf(stderr, "EDGES_EXPR is a comma separate list of src_TYPE:dst_TYPE pairs\n");
fprintf(stderr, "NODES_EXPR is a comme separate list of node TYPEs\n");
fprintf(stderr, "{NODES,EDGES}_EXPR can be empty strings to filter *out* all elements.\n");
fprintf(stderr, "TYPE is one of: cnt, dir, loc, ori, rel, rev, snp, *\n");
fprintf(stderr, "\nNote: you can use \"-\" for stdout in file names.\n");
exit(EXIT_FAILURE);
}
/* Command line arguments, as filled in by parse_cli(). The option arguments
 * point at getopt's optarg strings; output paths left unset on the command
 * line are defaulted to "-" (stdout) by parse_cli(). */
typedef struct {
char *outfile; // output path given via -o/--output
char *nodes_out; // nodes output path (-n/--nodes-output)
char *edges_out; // edges output path (-e/--edges-output)
char *nodes_filter; // NODES_EXPR given via -N/--nodes-filter, or NULL
char *edges_filter; // EDGES_EXPR given via -E/--edges-filter, or NULL
char *repo_dir; // Git repository directory (positional argument)
} cli_args;
......@@ -312,23 +305,29 @@ cli_args *parse_cli(int argc, char **argv) {
perror("Cannot allocate memory.");
exit(EXIT_FAILURE);
} else {
args->outfile = NULL;
args->nodes_out = NULL;
args->edges_out = NULL;
args->nodes_filter = NULL;
args->edges_filter = NULL;
args->repo_dir = NULL;
}
static struct option long_opts[] = {
{"edges-output", required_argument, 0, 'e' },
{"nodes-output", required_argument, 0, 'n' },
{"edges-filter", required_argument, 0, 'E' },
{"output", required_argument, 0, 'o' },
{"nodes-filter", required_argument, 0, 'N' },
{"help", no_argument, 0, 'h' },
{0, 0, 0, 0 }
};
while ((opt = getopt_long(argc, argv, "E:o:h", long_opts,
while ((opt = getopt_long(argc, argv, "e:n:E:N:h", long_opts,
NULL)) != -1) {
switch (opt) {
case 'e': args->edges_out = optarg; break;
case 'n': args->nodes_out = optarg; break;
case 'E': args->edges_filter = optarg; break;
case 'o': args->outfile = optarg; break;
case 'N': args->nodes_filter = optarg; break;
case 'h':
default:
exit_usage(NULL);
......@@ -338,8 +337,10 @@ cli_args *parse_cli(int argc, char **argv) {
exit_usage(NULL);
args->repo_dir = argv[optind];
if (args->outfile == NULL)
args->outfile = "-";
if (args->edges_out == NULL)
args->edges_out = "-";
if (args->nodes_out == NULL)
args->nodes_out = "-";
return args;
}
......@@ -407,12 +408,16 @@ void _dump_filters(FILE *out, int matrix[OBJ_TYPES][OBJ_TYPES], int vector[OBJ_T
/* set up nodes and edges restrictions, interpreting command line filters */
void init_graph_filters(char *edges_filter) {
void init_graph_filters(char *nodes_filter, char *edges_filter) {
char **filters;
char **types;
char **ptr;
int src_type, dst_type;
// Note: when either filter is the empty string, the parsing loops below
// will be skipped (due to g_strsplit's semantics on empty strings), which
// is what we want: all elements will be forbidden. A NULL filter (option
// not given) leaves the corresponding defaults untouched.
if (edges_filter != NULL) {
fill_matrix(_allowed_edges, false); // nothing allowed by default
filters = g_strsplit(edges_filter, ELT_SEP, -1); // "typ:typ" pairs
......@@ -437,7 +442,19 @@ void init_graph_filters(char *edges_filter) {
g_strfreev(filters);
}
init_allowed_nodes_from_edges(_allowed_edges, _allowed_nodes);
if (nodes_filter != NULL) {
fill_vector(_allowed_nodes, false); // nothing allowed by default
filters = g_strsplit(nodes_filter, ELT_SEP, -1); // "typ" fragments
for (ptr = filters; *ptr; ptr++) {
src_type = parse_otype(*ptr);
if (src_type == GIT_OBJ_ANY) { // "*" wildcard
fill_vector(_allowed_nodes, true);
break; // all nodes allowed already
} else
_allowed_nodes[src_type] = true;
}
g_strfreev(filters);
}
}
......@@ -447,11 +464,11 @@ int main(int argc, char **argv) {
int rc;
cli_args *args;
cb_payload *payload;
FILE *edges_out;
char edges_buf[EDGES_OUTSZ];
FILE *nodes_out, *edges_out;
char nodes_buf[EDGES_OUTSZ], edges_buf[EDGES_OUTSZ];
args = parse_cli(argc, argv);
init_graph_filters(args->edges_filter);
init_graph_filters(args->nodes_filter, args->edges_filter);
// _dump_filters(stdout, _allowed_edges, _allowed_nodes);
git_libgit2_init();
......@@ -460,12 +477,15 @@ int main(int argc, char **argv) {
check_lg2(git_repository_odb(&odb, repo),
"cannot get object DB", NULL);
edges_out = open_out_stream(args->outfile, edges_buf, EDGES_OUTSZ);
nodes_out = open_out_stream(args->nodes_out, nodes_buf, NODES_OUTSZ);
edges_out = open_out_stream(args->edges_out, edges_buf, EDGES_OUTSZ);
assert(NODES_OUTSZ <= PIPE_BUF && (NODES_OUTSZ % NODES_LINELEN == 0));
assert(EDGES_OUTSZ <= PIPE_BUF && (EDGES_OUTSZ % EDGES_LINELEN == 0));
payload = malloc(sizeof(cb_payload));
payload->odb = odb;
payload->repo = repo;
payload->nodes_out = nodes_out;
payload->edges_out = edges_out;
rc = git_odb_foreach(odb, emit_obj, payload);
......
swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3
swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7
swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8
swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687
swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c
swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c
swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5
swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985
swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23
swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99
swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6
swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca
swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5
swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3
swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7
swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8
swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687
swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c
swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c
swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5
swh:1:cnt:100b0dec8c53a40e4de7714b2c612dad5fad9985
swh:1:cnt:1fe912cdd835ae6be5feb79acafaa5fa8ea60f23
swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99
swh:1:cnt:5716ca5987cbf97d6bb54920bea6adde242d87e6
swh:1:cnt:76018072e09c5d31c8c6e3113b8aa0fe625195ca
swh:1:cnt:b210800439ffe3f2db0d47d9aab1969b38a770a5
swh:1:dir:0f9566327353acd6cba286508a56e71376fcfda3
swh:1:dir:205f6b799e7d5c2524468ca006a0131aa57ecce7
swh:1:dir:2312eb97a90b5e561508b4197c89f092f8fd5ef8
swh:1:dir:5917a22fb466d2088d926749b7362836f3f05687
swh:1:dir:89ff1a2aefcbff0f09197f0fd8beeb19a7b6e51c
swh:1:dir:a83dd64716d4b1afeb9821d2018ade21696a6d9c
swh:1:dir:e03c0f3158ec6b1432c83e2c093a8a293a4f58e5
swh:1:rel:1720af781051a8cafdf3cf134c263ec5c5e72412
swh:1:rel:d48ad9915be780fcfa296985f69df35e144864a5
swh:1:rev:20cca959bae94594f60450f339b408581f1b401f
swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56
swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039
swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6
swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6
swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0
swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9
swh:1:rev:20cca959bae94594f60450f339b408581f1b401f
swh:1:rev:261586c455130b4bf10a5be7ffb0bf4077581b56
swh:1:rev:4d267d40bc0dbbfaf1f5096de9873ca42ae03039
swh:1:rev:8fcfd562b8abe4573313d02e864b7df7d31537f6
swh:1:rev:945cc4759b4cc02c7ed57bcafeea82f3656f7bc6
swh:1:rev:9bf3ce249cf3d74ef57d5a1fb4227e26818553f0
swh:1:rev:bfbf4af79c903a8b2d8eacfacddef16467062fd9
#!/usr/bin/env bats
load repo_helper
# Select only rev -> rev edges; the empty -N filter suppresses all nodes.
# Expansions are quoted so that paths containing spaces or glob characters
# reach the helpers as single arguments.
@test "export revision self-edges" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E rev:rev
    assert_equal_graphs "${DATA_DIR}/graphs/rev-edges" "${TEST_TMPDIR}"
}
# Select every edge whose target is a revision; the empty -N filter
# suppresses all nodes. Expansions are quoted to avoid word splitting.
@test "export edges to revisions" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "*:rev"
    assert_equal_graphs "${DATA_DIR}/graphs/to-rev-edges" "${TEST_TMPDIR}"
}
# Select every edge whose source is a directory; the empty -N filter
# suppresses all nodes. Expansions are quoted to avoid word splitting.
@test "export edges from directories" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "dir:*"
    assert_equal_graphs "${DATA_DIR}/graphs/from-dir-edges" "${TEST_TMPDIR}"
}
# Select every edge whose source is a release; the empty -N filter
# suppresses all nodes. Expansions are quoted to avoid word splitting.
@test "export edges from releases" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -N '' -E "rel:*"
    assert_equal_graphs "${DATA_DIR}/graphs/from-rel-edges" "${TEST_TMPDIR}"
}
#!/usr/bin/env bats
load repo_helper
# Restrict edges to rev -> rev via -E; no nodes filter is passed.
# Expansions are quoted to avoid word splitting on unusual paths.
@test "export revisions" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E rev:rev
    assert_equal_graphs "${DATA_DIR}/graphs/revisions" "${TEST_TMPDIR}"
}
# Restrict edges to those targeting revisions via -E; no nodes filter is
# passed. Expansions are quoted to avoid word splitting on unusual paths.
@test "export edges with revision targets" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:rev"
    assert_equal_graphs "${DATA_DIR}/graphs/to-revisions" "${TEST_TMPDIR}"
}
# Restrict edges to those originating from directories via -E; no nodes
# filter is passed. Expansions are quoted to avoid word splitting.
@test "export directories" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "dir:*"
    assert_equal_graphs "${DATA_DIR}/graphs/directories" "${TEST_TMPDIR}"
}
# Restrict edges to those originating from releases via -E; no nodes
# filter is passed. Expansions are quoted to avoid word splitting.
@test "export releases" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "rel:*"
    assert_equal_graphs "${DATA_DIR}/graphs/releases" "${TEST_TMPDIR}"
}
......@@ -6,8 +6,3 @@ load repo_helper
run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR"
assert_equal_graphs ${DATA_DIR}/graphs/full ${TEST_TMPDIR}
}
# Pass the "*:*" wildcard edge filter and compare against the full graph
# fixture. Expansions are quoted to avoid word splitting on unusual paths.
@test "export entire graph (using wildcard)" {
    run_git2graph "$TEST_REPO_DIR" "$TEST_TMPDIR" -E "*:*"
    assert_equal_graphs "${DATA_DIR}/graphs/full" "${TEST_TMPDIR}"
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment