Commit ce8b9e03 authored by Stefano Zacchiroli's avatar Stefano Zacchiroli

git2graph: make sure it can be used concurrently and document how

parent 721cb191
@@ -11,10 +11,33 @@ Micro benchmark
---------------
$ time ./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz)
./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz)  233,67s user 15,76s system 91% cpu 4:32,62 total
./git2graph /srv/src/linux >(pigz -c > nodes.csv.gz) >(pigz -c > edges.csv.gz)  243,30s user 17,28s system 89% cpu 4:51,53 total
$ zcat nodes.csv.gz | wc -l
6503402
$ zcat edges.csv.gz | wc -l
305095437
Parallel use
------------
`git2graph` writes fixed-length lines, either 51 bytes (nodes) or 102 bytes
(edges) long. Writes to a FIFO are atomic as long as they are smaller than
`PIPE_BUF` (4096 bytes on Linux, and guaranteed by POSIX to be at least 512
bytes). Hence it is possible to mass analyze many repositories in parallel
with something like:
$ mkfifo nodes.fifo edges.fifo
$ sort -u < nodes.fifo | pigz -c > nodes.csv.gz &
$ sort -u < edges.fifo | pigz -c > edges.csv.gz &
$ parallel -i git2graph '{}' nodes.fifo edges.fifo -- repo_dir_1 repo_dir_2 ...
$ rm nodes.fifo edges.fifo
Note that you will most likely want to tune `sort` so that it runs in parallel
(`--parallel`), uses a large buffer size (`-S`), and uses a temporary directory
with enough available space (`-T`). (The above example uses `parallel`
from [moreutils](https://joeyh.name/code/moreutils/), but it can trivially be
adapted to use [GNU parallel](https://www.gnu.org/software/parallel/) or
similar parallelization tools.)
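For instance, the nodes de-duplication stage could be tuned along these lines (thread count, buffer size, and scratch directory are illustrative, not prescriptive):

```shell
# Hypothetical tuning: 4 sort threads, a 2 GiB in-memory buffer, and a
# scratch directory on a filesystem with enough free space for spill files.
mkfifo nodes.fifo
sort -u --parallel=4 -S 2G -T /srv/tmp nodes.fifo | pigz -c > nodes.csv.gz &
```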
@@ -11,14 +11,30 @@
* of graph edges as <from, to> PID pairs.
*/
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <git2.h>
#define SWH_PREFIX "swh:1"
#define SWH_DIR "swh:1:dir"
#define SWH_REV "swh:1:rev"
#define SWH_PIDSZ (GIT_OID_HEXSZ + 10) // size of a SWH PID
// line-lengths in nodes and edges file
#define NODES_LINELEN (SWH_PIDSZ + 1)
#define EDGES_LINELEN (SWH_PIDSZ * 2 + 2)
// Output buffer sizes for nodes and edges files. To guarantee atomic and
// non-interleaved writes (which matter when used concurrently writing to a
// shared FIFO), these sizes must be <= PIPE_BUF and multiples of
// {NODES,EDGES}_LINELEN.
#define NODES_OUTSZ ((PIPE_BUF / NODES_LINELEN) * NODES_LINELEN)
#define EDGES_OUTSZ ((PIPE_BUF / EDGES_LINELEN) * EDGES_LINELEN)
/* extra payload for callback invoked on Git objects */
@@ -177,6 +193,8 @@ int main(int argc, char **argv) {
int rc;
cb_payload *payload;
FILE *nodes, *edges;
char nodes_buf[NODES_OUTSZ];
char edges_buf[EDGES_OUTSZ];
if (argc != 4) {
fprintf(stderr,
@@ -204,6 +222,12 @@ int main(int argc, char **argv) {
exit(EXIT_FAILURE);
}
// ensure atomic and non-interleaved writes
assert(NODES_OUTSZ <= PIPE_BUF && (NODES_OUTSZ % NODES_LINELEN == 0));
assert(EDGES_OUTSZ <= PIPE_BUF && (EDGES_OUTSZ % EDGES_LINELEN == 0));
setvbuf(nodes, nodes_buf, _IOFBF, NODES_OUTSZ);
setvbuf(edges, edges_buf, _IOFBF, EDGES_OUTSZ);
payload = malloc(sizeof(cb_payload));
payload->odb = odb;
payload->repo = repo;