
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (8)
@@ -9,7 +9,7 @@ relatively simple traversal queries on the compressed graph.
 The client/server architecture allows it to only load the graph in memory once
 then serve multiple different requests. However, it is limited in expressivity;
 more complex or resource-intensive queries should rather use the
-:ref:`Low-level Java API <swh-graph-java-api>` to run them as standalone
+`Low-level Rust API <https://docs.rs/swh-graph/>`_ to run them as standalone
 programs.
@@ -24,45 +24,6 @@ using this environment variable::
     TMPDIR=/srv/softwareheritage/ssd/tmp
 
-Memory mapping vs Direct loading
---------------------------------
-
-The main dial you can use to manage your memory usage is to chose between
-memory-mapping and direct-loading the graph data. The different loading modes
-available when loading the graph are documented in :ref:`swh-graph-rust-api`.
-
-Loading in mapped mode will not load any extra data in RAM, but will instead
-use the ``mmap(1)`` syscall to put the graph file located on disk in the
-virtual address space. The Linux kernel will then be free to arbitrarily cache
-the file, either partially or in its entirety, depending on the available
-memory space.
-
-In our experiments, memory-mapping a small graph from a SSD only incurs a
-relatively small slowdown (about 15-20%). However, when the graph is too big to
-fit in RAM, the kernel has to constantly invalidate pages to cache newly
-accessed sections, which incurs a very large performance penalty. A full
-traversal of a large graph that usually takes about 20 hours when loaded in
-main memory could take more than a year when mapped from a hard drive!
-
-When deciding what to direct-load and what to memory-map, here are a few rules
-of thumb:
-
-- If you don't need random access to the graph edges, you can consider using
-  the "offline" loading mode. The offsets won't be loaded which will save
-  dozens of gigabytes of RAM.
-- If you only need to query some specific nodes or run trivial traversals,
-  memory-mapping the graph from a HDD should be a reasonable solution that
-  doesn't take an inordinate amount of time. It might be bad for your disks,
-  though.
-- If you are constrained in available RAM, memory-mapping the graph from an SSD
-  offers reasonable performance for reasonably complex algorithms.
-- If you have a heavy workload (i.e. running a full traversal of the entire
-  graph) and you can afford the RAM, direct loading will be orders of magnitude
-  faster than all the above options.
-
 Sharing mapped data across processes
 ------------------------------------
......
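The hunk above removes the documentation contrasting memory-mapping with direct loading. The trade-off it describes is generic, and can be illustrated with a minimal sketch. This is not the swh-graph API; the file path and contents below are hypothetical, and only the byte-level access pattern is the point:

```python
import mmap

# Hypothetical example file; real swh-graph files are laid out
# differently -- this only illustrates the loading trade-off.
PATH = "/tmp/example.graph"

with open(PATH, "wb") as f:
    f.write(bytes(range(256)))

def load_direct(path: str) -> bytes:
    """Direct loading: copy the whole file into RAM up front.
    Fastest random access, but needs as much free memory as the file."""
    with open(path, "rb") as f:
        return f.read()

def load_mapped(path: str) -> mmap.mmap:
    """Memory mapping: expose the on-disk file in the virtual address
    space; the kernel pages it in (and caches it) lazily on access."""
    f = open(path, "rb")
    return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

# Both objects support the same byte-level random access,
# so the choice is purely a memory/latency trade-off:
direct = load_direct(PATH)
mapped = load_mapped(PATH)
assert direct[42] == mapped[42] == 42
```

Whether the mapped version is nearly as fast as the direct one depends entirely on whether the kernel page cache can hold the hot parts of the file, which is exactly the point the removed section was making.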
@@ -235,7 +235,11 @@ def reindex(
     graph: str,
     debug: bool,
 ):
-    """Downloads a compressed SWH graph to the given target directory"""
+    """Reindex a SWH GRAPH to the latest graph format.
+
+    GRAPH should be composed of the graph folder followed by the graph prefix
+    (by default "graph") eg. "graph_folder/graph".
+    """
     import os.path
 
     from swh.graph.shell import Rust
@@ -600,18 +600,9 @@ class Bv(_CompressionStepTask):
         return int(self._mph_size() + batch_size)
 
-class BvOffsets(_CompressionStepTask):
-    STEP = CompressionStep.BV_OFFSETS
-    INPUT_FILES = {"-base.graph"}
-    OUTPUT_FILES = {"-base.offsets"}
-
-    def _large_java_allocations(self) -> int:
-        return 0
-
 class BvEf(_CompressionStepTask):
     STEP = CompressionStep.BV_EF
-    INPUT_FILES = {"-base.offsets"}
+    INPUT_FILES = {"-base.graph"}
     OUTPUT_FILES = {"-base.ef"}
 
     def _large_java_allocations(self) -> int:
@@ -697,7 +688,7 @@ class Llp(_CompressionStepTask):
 class PermuteLlp(_CompressionStepTask):
     STEP = CompressionStep.PERMUTE_LLP
-    INPUT_FILES = {".pthash.order", "-base.graph", "-base.offsets"}
+    INPUT_FILES = {".pthash.order", "-base.graph", "-base.ef"}
     OUTPUT_FILES = {".graph", ".properties"}
 
     def _large_java_allocations(self) -> int:
@@ -915,7 +906,7 @@ class NodeProperties(_CompressionStepTask):
 class PthashLabels(_CompressionStepTask):
-    STEP = CompressionStep.PTHASH_LABELS
+    STEP = CompressionStep.MPH_LABELS
     INPUT_FILES = {".labels.csv.zst", ".labels.count.txt"}
     OUTPUT_FILES = {".labels.pthash"}
@@ -923,8 +914,8 @@ class PthashLabels(_CompressionStepTask):
         return 0
 
-class PthashLabelsOrder(_CompressionStepTask):
-    STEP = CompressionStep.PTHASH_LABELS_ORDER
+class LabelsOrder(_CompressionStepTask):
+    STEP = CompressionStep.LABELS_ORDER
     INPUT_FILES = {".labels.csv.zst", ".labels.pthash", ".labels.count.txt"}
     OUTPUT_FILES = {".labels.pthash.order"}
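The hunks above rewire the step dependencies: with the offsets step removed, ``BvEf`` now reads ``-base.graph`` directly, and ``PermuteLlp`` depends on ``-base.ef`` instead of ``-base.offsets``. The general scheme, where each task declares input and output file suffixes and becomes runnable once all inputs exist, can be sketched as follows. This is a deliberately simplified, hypothetical model, not the actual swh.graph scheduler, and the ``LLP`` entry's inputs are invented for the example:

```python
# Hypothetical, simplified model of suffix-based step dependencies:
# a step is runnable once every input suffix has been produced.
STEPS = {
    "BV": {"inputs": set(), "outputs": {"-base.graph"}},
    "BV_EF": {"inputs": {"-base.graph"}, "outputs": {"-base.ef"}},
    "LLP": {"inputs": {"-base.ef"}, "outputs": {".pthash.order"}},
    "PERMUTE_LLP": {
        "inputs": {".pthash.order", "-base.graph", "-base.ef"},
        "outputs": {".graph", ".properties"},
    },
}

def schedule(steps):
    """Return step names in an order where every input is produced first."""
    done, order = set(), []
    pending = dict(steps)
    while pending:
        # A step is ready when all of its input suffixes exist already.
        ready = [name for name, s in pending.items() if s["inputs"] <= done]
        if not ready:
            raise ValueError("dependency cycle or missing producer")
        for name in sorted(ready):
            done |= pending.pop(name)["outputs"]
            order.append(name)
    return order

print(schedule(STEPS))
# ['BV', 'BV_EF', 'LLP', 'PERMUTE_LLP']
```

Under this model, deleting the ``BV_OFFSETS`` step forces every consumer of ``-base.offsets`` to be repointed at a suffix that is still produced, which is exactly what the diff does.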
@@ -41,9 +41,8 @@ class CompressionStep(Enum):
     LABEL_STATS = 6
     MPH = 10
     BV = 30
-    BV_OFFSETS = 40
-    BV_EF = 50
-    BFS_ROOTS = 55
+    BV_EF = 40
+    BFS_ROOTS = 50
     BFS = 60
     PERMUTE_AND_SIMPLIFY_BFS = 70
     BFS_EF = 80
@@ -61,8 +60,8 @@ class CompressionStep(Enum):
     PERSONS_STATS = 195
     MPH_PERSONS = 200
     NODE_PROPERTIES = 210
-    PTHASH_LABELS = 220
-    PTHASH_LABELS_ORDER = 225
+    MPH_LABELS = 220
+    LABELS_ORDER = 225
     FCL_LABELS = 230
     EDGE_LABELS = 240
     EDGE_LABELS_TRANSPOSE = 250
@@ -163,11 +162,6 @@ STEP_ARGV: Dict[CompressionStep, List[str]] = {
         "{in_dir}",
         "{out_dir}/{graph_name}-base",
     ],
-    CompressionStep.BV_OFFSETS: [
-        "{rust_executable_dir}/swh-graph-index",
-        "offsets",
-        "{out_dir}/{graph_name}-base",
-    ],
     CompressionStep.BV_EF: [
         "{rust_executable_dir}/swh-graph-index",
         "ef",
@@ -327,7 +321,7 @@ STEP_ARGV: Dict[CompressionStep, List[str]] = {
         "{in_dir}",
         "{out_dir}/{graph_name}",
     ],
-    CompressionStep.PTHASH_LABELS: [
+    CompressionStep.MPH_LABELS: [
         "{rust_executable_dir}/swh-graph-compress",
         "pthash-labels",
         "--num-labels",
@@ -335,7 +329,7 @@ STEP_ARGV: Dict[CompressionStep, List[str]] = {
         "<(zstdcat {out_dir}/{graph_name}.labels.csv.zst)",
         "{out_dir}/{graph_name}.labels.pthash",
     ],
-    CompressionStep.PTHASH_LABELS_ORDER: [
+    CompressionStep.LABELS_ORDER: [
         "{rust_executable_dir}/swh-graph-compress",
         "pthash-labels-order",
         "--num-labels",
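The ``STEP_ARGV`` entries touched above are argv templates whose ``{…}`` placeholders are filled in before the step's executable is spawned. A sketch of how such a template might be expanded, using Python's standard ``str.format``. The parameter values below are hypothetical, and this is an illustration of the template style, not the actual swh.graph expansion code:

```python
# Expand an argv template in the style of the STEP_ARGV entries above.
# All paths here are hypothetical examples.
template = [
    "{rust_executable_dir}/swh-graph-index",
    "ef",
    "{out_dir}/{graph_name}-base",
]

params = {
    "rust_executable_dir": "/usr/local/bin",
    "out_dir": "/srv/graph/out",
    "graph_name": "graph",
}

# Each element is formatted independently, yielding a concrete argv
# list ready to hand to a process spawner.
argv = [part.format(**params) for part in template]
print(argv)
# ['/usr/local/bin/swh-graph-index', 'ef', '/srv/graph/out/graph-base']
```

Note that one entry above uses a ``<(zstdcat …)`` process substitution, so the real pipeline must run such an argv through a shell that supports it (e.g. bash) rather than spawning the executable directly.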