graph export: compute node/edge type stats

8d79b847 · Antoine Pietri · d26a4247 · 8d79b847 · 8d79b847
Commit 8d79b847 authored 4 years ago by Antoine Pietri
--- a/swh/dataset/graph.py
+++ b/swh/dataset/graph.py
@@ -184,13 +184,22 @@ def sort_graph_nodes(export_path, config):
       function) ;
     - deflate the edges ;
     - count the number of edges and write it in graph.edges.count.txt ;
+     - count the number of edges of each "source type:destination type"
+       pair and write the sorted counts in graph.edges.stats.txt ;
     - concatenate all the (deflated) nodes from the export with the
       destination edges, and sort the output to get the list of unique graph
       nodes ;
     - count the number of unique graph nodes and write it in
       graph.nodes.count.txt ;
+     - count the number of occurrences of each node type and write them
+       in graph.nodes.stats.txt ;
     - compress and write the resulting nodes in graph.nodes.csv.zst.
    """
+
+    # Use awk as a replacement for `sort | uniq -c`: awk keeps only one counter
+    # per distinct value instead of buffering every input line in memory
+    counter_command = "awk '{ t[$0]++ } END { for (i in t) print i,t[i] }'"
+
    # Use bytes for the sorting algorithm (faster than being locale-specific)
    env = {
        **os.environ.copy(),
@@ -212,15 +221,20 @@ def sort_graph_nodes(export_path, config):
                    "tee {export_path}/graph.edges.csv.zst |"
                    "zstdcat |"
                    "tee >( wc -l > {export_path}/graph.edges.count.txt ) |"
+                    "tee >( cut -d: -f3,6 | {counter_command} | sort "
+                    "           > {export_path}/graph.edges.stats.txt ) |"
                    "cut -d' ' -f2 | "
                    "cat - <( zstdcat {export_path}/*/*.nodes.csv.zst ) | "
                    "sort -u -S{sort_buffer_size} -T{buffer_path} | "
                    "tee >( wc -l > {export_path}/graph.nodes.count.txt ) |"
+                    "tee >( cut -d: -f3 | {counter_command} | sort "
+                    "           > {export_path}/graph.nodes.stats.txt ) |"
                    "zstdmt > {export_path}/graph.nodes.csv.zst"
                ).format(
                    export_path=shlex.quote(str(export_path)),
                    buffer_path=shlex.quote(str(buffer_path)),
                    sort_buffer_size=shlex.quote(sort_buffer_size),
+                    counter_command=counter_command,
                ),
            ],
            env=env,

--- a/swh/dataset/test/test_graph.py
+++ b/swh/dataset/test/test_graph.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+import collections
 import hashlib
 from typing import Tuple

@@ -484,9 +485,32 @@ def test_sort_pipeline(tmp_path):
    output_nodes = list(filter(bool, output_nodes))
    output_edges = list(filter(bool, output_edges))

-    expected_nodes = set(input_nodes) | set(l.split()[1] for l in input_edges)
+    expected_nodes = set(input_nodes) | set(e.split()[1] for e in input_edges)
    assert output_nodes == sorted(expected_nodes)
    assert int((tmp_path / "graph.nodes.count.txt").read_text()) == len(expected_nodes)

    assert sorted(output_edges) == sorted(input_edges)
    assert int((tmp_path / "graph.edges.count.txt").read_text()) == len(input_edges)
+
+    actual_node_stats = (tmp_path / "graph.nodes.stats.txt").read_text().strip()
+    expected_node_stats = "\n".join(
+        sorted(
+            "{} {}".format(k, v)
+            for k, v in collections.Counter(
+                node.split(":")[2] for node in expected_nodes
+            ).items()
+        )
+    )
+    assert actual_node_stats == expected_node_stats
+
+    actual_edge_stats = (tmp_path / "graph.edges.stats.txt").read_text().strip()
+    expected_edge_stats = "\n".join(
+        sorted(
+            "{} {}".format(k, v)
+            for k, v in collections.Counter(
+                "{}:{}".format(edge.split(":")[2], edge.split(":")[5])
+                for edge in input_edges
+            ).items()
+        )
+    )
+    assert actual_edge_stats == expected_edge_stats