diff --git a/java/src/main/java/org/softwareheritage/graph/utils/TopoSort.java b/java/src/main/java/org/softwareheritage/graph/utils/TopoSort.java index 32a13877fb72420606dadd0b1206a22254f5099b..08bbffe04f885907dd0b67b42e90a37984f7a403 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/TopoSort.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/TopoSort.java @@ -25,7 +25,7 @@ import java.util.*; * * Sample invocation: * - * $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx1000G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.TopoSort /dev/shm/swh-graph/default/graph 'rev,rel,snp,ori' \ + * $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx1000G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.TopoSort /dev/shm/swh-graph/default/graph backward 'rev,rel,snp,ori' \ * | pv --line-mode --wait \ * | zstdmt \ * > /poolswh/softwareheritage/vlorentz/2022-04-25_toposort_rev,rel,snp,ori.txt.zst @@ -36,20 +36,37 @@ public class TopoSort { private Subgraph transposedGraph; public static void main(String[] args) throws IOException, ClassNotFoundException { - if (args.length != 2) { - System.err.println("Syntax: java org.softwareheritage.graph.utils.TopoSort <path/to/graph> <nodeTypes>"); + if (args.length != 3) { + System.err.println( + "Syntax: java org.softwareheritage.graph.utils.TopoSort <path/to/graph> {forward|backward} <nodeTypes>"); System.exit(1); } String graphPath = args[0]; - String nodeTypes = args[1]; + String directionString = args[1]; + String nodeTypes = args[2]; TopoSort toposort = new TopoSort(); - toposort.load_graph(graphPath, nodeTypes); + toposort.loadGraph(graphPath, nodeTypes); + + if (directionString.equals("forward")) { + toposort.swapGraphs(); + } else if (!directionString.equals("backward")) { + System.err.println("Invalid direction " + directionString); + System.exit(1); + } + toposort.toposortDFS(); } - public void load_graph(String graphBasename, String nodeTypes) throws IOException { + public void swapGraphs() { + Subgraph tmp; + tmp = graph; + graph = transposedGraph; + transposedGraph = tmp; + } + + public void loadGraph(String graphBasename, String nodeTypes) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); var underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename); System.err.println("Selecting subgraphs."); diff --git a/swh/graph/cli.py b/swh/graph/cli.py index cb58847c377bc2b0ea823f638483bc6dc6f715f4..b45648e2c6c8f25399030af42d745a1039b1beea 100644 --- a/swh/graph/cli.py +++ b/swh/graph/cli.py @@ -383,7 +383,7 @@ def luigi( local_export_path=dataset_path, local_graph_path=dataset_path / "compressed", derived_datasets_path=dataset_path, - topological_order_path=dataset_path / "topology/topological_order_dfs.csv.zst", + topological_order_dir=dataset_path / "topology/", origin_contributors_path=dataset_path / "datasets/contribution_graph.csv.zst", origin_urls_path=dataset_path / "datasets/origin_urls.csv.zst", export_id=f"{dataset_name}-{secrets.token_hex(10)}", diff --git a/swh/graph/luigi/misc_datasets.py b/swh/graph/luigi/misc_datasets.py index f755a7fb851a1bf26df0dbe63d71440875781e33..643d6d8aa649a28b1f584d77ec06ab62fb2f4618 100644 --- a/swh/graph/luigi/misc_datasets.py +++ b/swh/graph/luigi/misc_datasets.py @@ -34,6 +34,7 @@ And optionally:: # WARNING: do not import unnecessary things here to keep cli startup time under # control +from pathlib import Path from typing import List import luigi @@ -41,14 +42,18 @@ import luigi from .compressed_graph import LocalGraph from .utils import run_script +OBJECT_TYPES = {"ori", "snp", "rel", "rev", "dir", "cnt"} + class TopoSort(luigi.Task): """Creates a file that contains all SWHIDs in topological order from a compressed graph.""" local_graph_path = luigi.PathParameter() - topological_order_path = luigi.PathParameter() + topological_order_dir = luigi.PathParameter() graph_name = luigi.Parameter(default="graph") + object_types = luigi.Parameter() + direction = luigi.ChoiceParameter(choices=["forward", "backward"]) def requires(self) -> List[luigi.Task]: """Returns an instance of :class:`LocalGraph`.""" @@ -56,15 +61,20 @@ class TopoSort(luigi.Task): def output(self) -> luigi.Target: """.csv.zst file that contains the topological order.""" - return luigi.LocalTarget(self.topological_order_path) + return luigi.LocalTarget( + self.topological_order_dir + / f"topological_order_dfs_{self.direction}_{self.object_types}.csv.zst" + ) def run(self) -> None: """Runs org.softwareheritage.graph.utils.TopoSort and compresses""" - object_types = "rev,rel,snp,ori" + invalid_object_types = set(self.object_types.split(",")) - OBJECT_TYPES + if invalid_object_types: + raise ValueError(f"Invalid object types: {invalid_object_types}") class_name = "org.softwareheritage.graph.utils.TopoSort" script = f""" - java {class_name} '{self.local_graph_path}/{self.graph_name}' '{object_types}' \ + java {class_name} '{self.local_graph_path}/{self.graph_name}' '{self.direction}' '{self.object_types}' \ | pv --line-mode --wait \ | zstdmt -19 - """ - run_script(script, self.topological_order_path) + """ # noqa + run_script(script, Path(self.output().path)) diff --git a/swh/graph/luigi/origin_contributors.py b/swh/graph/luigi/origin_contributors.py index f9a7e0f80bdef21485571a77a2e2cd62b9520a75..726c4dafe70b10a2ecf46244543e4c276841ffa4 100644 --- a/swh/graph/luigi/origin_contributors.py +++ b/swh/graph/luigi/origin_contributors.py @@ -28,22 +28,24 @@ class ListOriginContributors(luigi.Task): graph.""" local_graph_path = luigi.PathParameter() - topological_order_path = luigi.PathParameter() + topological_order_dir = luigi.PathParameter() origin_contributors_path = luigi.PathParameter() origin_urls_path = luigi.PathParameter() graph_name = luigi.Parameter(default="graph") - def requires(self) -> List[luigi.Task]: + def requires(self) -> Dict[str, luigi.Task]: """Returns an instance of :class:`swh.graph.luigi.compressed_graph.LocalGraph` and :class:`swh.graph.luigi.misc_datasets.TopoSort`.""" - return [ - LocalGraph(local_graph_path=self.local_graph_path), - TopoSort( + return { + "graph": LocalGraph(local_graph_path=self.local_graph_path), + "toposort": TopoSort( local_graph_path=self.local_graph_path, - topological_order_path=self.topological_order_path, + topological_order_dir=self.topological_order_dir, graph_name=self.graph_name, + direction="backward", + object_types="rev,rel,snp,ori", ), - ] + } def output(self) -> luigi.Target: """.csv.zst file that contains the topological order.""" @@ -53,14 +55,16 @@ class ListOriginContributors(luigi.Task): """Runs org.softwareheritage.graph.utils.TopoSort and compresses""" import tempfile + topological_order_path = Path(self.input()["toposort"].path) + class_name = "org.softwareheritage.graph.utils.ListOriginContributors" with tempfile.NamedTemporaryFile( prefix="origin_urls_", suffix=".csv" ) as origin_urls_fd: script = f""" - zstdcat {self.topological_order_path} \ + zstdcat {topological_order_path} \ | java {class_name} '{self.local_graph_path}/{self.graph_name}' '{origin_urls_fd.name}' \ - | pv --line-mode --wait --size $(zstdcat '{self.topological_order_path}' | wc -l) \ + | pv --line-mode --wait --size $(zstdcat '{topological_order_path}' | wc -l) \ | zstdmt -19 """ # noqa run_script(script, self.origin_contributors_path) diff --git a/swh/graph/tests/test_origin_contributors.py b/swh/graph/tests/test_origin_contributors.py index 46189dc394fbb971f400eb36b8d0d5845179768f..20d7cd2365254e432864c71a851d98014a3dc92e 100644 --- a/swh/graph/tests/test_origin_contributors.py +++ b/swh/graph/tests/test_origin_contributors.py @@ -22,7 +22,7 @@ from swh.model.model import ( TimestampWithTimezone, ) -from .test_toposort import EXPECTED as TOPOLOGICAL_ORDER +from .test_toposort import EXPECTED_BACKWARD as TOPOLOGICAL_ORDER DATA_DIR = Path(__file__).parents[0] / "dataset" @@ -79,7 +79,10 @@ origin_id,contributor_base64,contributor_escaped def test_list_origin_contributors(tmpdir): tmpdir = Path(tmpdir) - topological_order_path = tmpdir / "topo_order.csv.zst" + topological_order_dir = tmpdir + topological_order_path = ( + topological_order_dir / "topological_order_dfs_backward_rev,rel,snp,ori.csv.zst" + ) origin_contributors_path = tmpdir / "origin_contributors.csv.zst" origin_urls_path = tmpdir / "origin_urls.csv.zst" @@ -91,7 +94,7 @@ def test_list_origin_contributors(tmpdir): task = ListOriginContributors( local_graph_path=DATA_DIR / "compressed", - topological_order_path=topological_order_path, + topological_order_dir=topological_order_dir, origin_contributors_path=origin_contributors_path, origin_urls_path=origin_urls_path, graph_name="example", diff --git a/swh/graph/tests/test_toposort.py b/swh/graph/tests/test_toposort.py index 67dde59276b61055dd698d3bfbd71f9ceb4e5009..15a8026c1514ca4ce6e669b11c24b35fb2e3f479 100644 --- a/swh/graph/tests/test_toposort.py +++ b/swh/graph/tests/test_toposort.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,6 +6,8 @@ from pathlib import Path import subprocess +import pytest + from swh.graph.luigi.misc_datasets import TopoSort DATA_DIR = Path(__file__).parents[0] / "dataset" @@ -14,7 +16,7 @@ DATA_DIR = Path(__file__).parents[0] / "dataset" # FIXME: the order of sample ancestors should not be hardcoded # FIXME: swh:1:snp:0000000000000000000000000000000000000022,3,1,swh has three possible # sample ancestors; they should not be hardecoded here -EXPECTED = """\ +EXPECTED_BACKWARD = """\ SWHID,ancestors,successors,sample_ancestor1,sample_ancestor2 swh:1:rev:0000000000000000000000000000000000000003,0,1,, swh:1:rev:0000000000000000000000000000000000000009,1,4,swh:1:rev:0000000000000000000000000000000000000003, @@ -29,15 +31,35 @@ swh:1:snp:0000000000000000000000000000000000000022,3,1,swh:1:rev:000000000000000 swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0,swh:1:snp:0000000000000000000000000000000000000022, """ +EXPECTED_FORWARD = """\ +SWHID,ancestors,successors,sample_ancestor1,sample_ancestor2 +swh:1:rel:0000000000000000000000000000000000000019,0,1,, +swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,0,1,, +swh:1:snp:0000000000000000000000000000000000000020,1,2,swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054, +swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,0,1,, +swh:1:snp:0000000000000000000000000000000000000022,1,3,swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165, +swh:1:rel:0000000000000000000000000000000000000021,1,1,swh:1:snp:0000000000000000000000000000000000000022, +swh:1:rev:0000000000000000000000000000000000000018,2,1,swh:1:rel:0000000000000000000000000000000000000021,swh:1:rel:0000000000000000000000000000000000000019 +swh:1:rev:0000000000000000000000000000000000000013,1,1,swh:1:rev:0000000000000000000000000000000000000018, +swh:1:rel:0000000000000000000000000000000000000010,2,1,swh:1:snp:0000000000000000000000000000000000000022,swh:1:snp:0000000000000000000000000000000000000020 +swh:1:rev:0000000000000000000000000000000000000009,4,1,swh:1:snp:0000000000000000000000000000000000000022,swh:1:rel:0000000000000000000000000000000000000010 +swh:1:rev:0000000000000000000000000000000000000003,1,0,swh:1:rev:0000000000000000000000000000000000000009, +""" + -def test_toposort(tmpdir): +@pytest.mark.parametrize("direction", ["backward", "forward"]) +def test_toposort(tmpdir, direction: str): tmpdir = Path(tmpdir) - topological_order_path = tmpdir / "topo_order.csv.zst" + topological_order_path = ( + tmpdir / f"topological_order_dfs_{direction}_rev,rel,snp,ori.csv.zst" + ) task = TopoSort( local_graph_path=DATA_DIR / "compressed", - topological_order_path=topological_order_path, + topological_order_dir=tmpdir, + direction=direction, + object_types="rev,rel,snp,ori", graph_name="example", ) @@ -45,23 +67,39 @@ def test_toposort(tmpdir): csv_text = subprocess.check_output(["zstdcat", topological_order_path]).decode() + expected = EXPECTED_BACKWARD if direction == "backward" else EXPECTED_FORWARD + (header, *rows) = csv_text.split("\n") - (expected_header, *expected_lines) = EXPECTED.split("\n") + (expected_header, *expected_lines) = expected.split("\n") assert header == expected_header - # The only possible first line - assert rows[0] == "swh:1:rev:0000000000000000000000000000000000000003,0,1,," - assert set(rows) == set(expected_lines) assert rows.pop() == "", "Missing trailing newline" - # The only three possible last lines - assert rows[-1] in [ - "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0" - ",swh:1:snp:0000000000000000000000000000000000000020,", - "swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0" - ",swh:1:snp:0000000000000000000000000000000000000022,", - "swh:1:rel:0000000000000000000000000000000000000019,1,0" - ",swh:1:rev:0000000000000000000000000000000000000018,", - ] + if direction == "backward": + # Only one possible first row + assert rows[0] == "swh:1:rev:0000000000000000000000000000000000000003,0,1,," + + # The only three possible last rows + assert rows[-1] in [ + "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,1,0" + ",swh:1:snp:0000000000000000000000000000000000000020,", + "swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,1,0" + ",swh:1:snp:0000000000000000000000000000000000000022,", + "swh:1:rel:0000000000000000000000000000000000000019,1,0" + ",swh:1:rev:0000000000000000000000000000000000000018,", + ] + else: + # Three possible first rows + assert rows[0] in [ + "swh:1:rel:0000000000000000000000000000000000000019,0,1,,", + "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054,0,1,,", + "swh:1:ori:8f50d3f60eae370ddbf85c86219c55108a350165,0,1,,", + ] + + # The only possible last row + assert rows[-1] == ( + "swh:1:rev:0000000000000000000000000000000000000003,1,0," + "swh:1:rev:0000000000000000000000000000000000000009," + )