Skip to content
Snippets Groups Projects
Commit 8d79b847 authored by Antoine Pietri
Browse files

graph export: compute node/edge type stats

parent d26a4247
No related branches found
No related tags found
No related merge requests found
......@@ -184,13 +184,22 @@ def sort_graph_nodes(export_path, config):
function) ;
- deflate the edges ;
- count the number of edges and write it in graph.edges.count.txt ;
- count the number of occurrences of each edge type and write them
in graph.edges.stats.txt ;
- concatenate all the (deflated) nodes from the export with the
destination edges, and sort the output to get the list of unique graph
nodes ;
- count the number of unique graph nodes and write it in
graph.nodes.count.txt ;
- count the number of occurrences of each node type and write them
in graph.nodes.stats.txt ;
- compress and write the resulting nodes in graph.nodes.csv.zst.
"""
# Use awk as a replacement of `sort | uniq -c` to avoid buffering everything
# in memory
counter_command = "awk '{ t[$0]++ } END { for (i in t) print i,t[i] }'"
# Use bytes for the sorting algorithm (faster than being locale-specific)
env = {
**os.environ.copy(),
......@@ -212,15 +221,20 @@ def sort_graph_nodes(export_path, config):
"tee {export_path}/graph.edges.csv.zst |"
"zstdcat |"
"tee >( wc -l > {export_path}/graph.edges.count.txt ) |"
"tee >( cut -d: -f3,6 | {counter_command} | sort "
" > {export_path}/graph.edges.stats.txt ) |"
"cut -d' ' -f2 | "
"cat - <( zstdcat {export_path}/*/*.nodes.csv.zst ) | "
"sort -u -S{sort_buffer_size} -T{buffer_path} | "
"tee >( wc -l > {export_path}/graph.nodes.count.txt ) |"
"tee >( cut -d: -f3 | {counter_command} | sort "
" > {export_path}/graph.nodes.stats.txt ) |"
"zstdmt > {export_path}/graph.nodes.csv.zst"
).format(
export_path=shlex.quote(str(export_path)),
buffer_path=shlex.quote(str(buffer_path)),
sort_buffer_size=shlex.quote(sort_buffer_size),
counter_command=counter_command,
),
],
env=env,
......
......@@ -3,6 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import collections
import hashlib
from typing import Tuple
......@@ -484,9 +485,32 @@ def test_sort_pipeline(tmp_path):
output_nodes = list(filter(bool, output_nodes))
output_edges = list(filter(bool, output_edges))
expected_nodes = set(input_nodes) | set(l.split()[1] for l in input_edges)
expected_nodes = set(input_nodes) | set(e.split()[1] for e in input_edges)
assert output_nodes == sorted(expected_nodes)
assert int((tmp_path / "graph.nodes.count.txt").read_text()) == len(expected_nodes)
assert sorted(output_edges) == sorted(input_edges)
assert int((tmp_path / "graph.edges.count.txt").read_text()) == len(input_edges)
actual_node_stats = (tmp_path / "graph.nodes.stats.txt").read_text().strip()
expected_node_stats = "\n".join(
sorted(
"{} {}".format(k, v)
for k, v in collections.Counter(
node.split(":")[2] for node in expected_nodes
).items()
)
)
assert actual_node_stats == expected_node_stats
actual_edge_stats = (tmp_path / "graph.edges.stats.txt").read_text().strip()
expected_edge_stats = "\n".join(
sorted(
"{} {}".format(k, v)
for k, v in collections.Counter(
"{}:{}".format(edge.split(":")[2], edge.split(":")[5])
for edge in input_edges
).items()
)
)
assert actual_edge_stats == expected_edge_stats
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment