From ac7e3834d3e27676f73ee92bc8f5fb2cbefc0818 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 13 Jan 2023 10:54:31 +0100
Subject: [PATCH 1/8] Add PopularContents to generate a list of popular
 (swh:1:cnt, file_name) pairs

---
 .../graph/utils/PopularContents.java          | 119 ++++++++++++++++++
 swh/graph/luigi/misc_datasets.py              |  31 +++++
 2 files changed, 150 insertions(+)
 create mode 100644 java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
new file mode 100644
index 000000000..ab94ddd1f
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2020-2023 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.utils;
+
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import com.martiansoftware.jsap.*;
+import org.softwareheritage.graph.*;
+import org.softwareheritage.graph.labels.DirEntry;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.IntStream;
+import java.util.concurrent.atomic.AtomicLong;
+
+/* Lists all nodes nodes of the types given as argument, in topological order,
+ * from leaves (contents, if selected) to the top (origins, if selected).
+ *
+ * This uses a DFS, so nodes are likely to be close to their neighbors.
+ *
+ * Some extra information is provided to allow more efficient consumption
+ * of the output: number of ancestors, successors, and a sample of two ancestors.
+ *
+ * Sample invocation:
+ *
+ *   $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx500G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.PopularContents /dev/shm/swh-graph/default/graph 100 \
+ *      | pv --line-mode --wait \
+ *      | zstdmt \
+ *      > /poolswh/softwareheritage/vlorentz/2022-04-25_popular_contents.txt.zst
+ */
+
+public class PopularContents {
+    private SwhBidirectionalGraph graph;
+    private int NUM_THREADS = 96;
+
+    public static void main(String[] args) throws IOException, ClassNotFoundException {
+        if (args.length != 2) {
+            System.err.println(
+                    "Syntax: java org.softwareheritage.graph.utils.PopularContents <path/to/graph> <popularity threshold>");
+            System.exit(1);
+        }
+        String graphPath = args[0];
+        long popularityThreshold = Long.parseLong(args[1]);
+
+        PopularContents popular_contents = new PopularContents();
+
+        popular_contents.loadGraph(graphPath);
+
+        popular_contents.run(popularityThreshold);
+    }
+
+    public void loadGraph(String graphBasename) throws IOException {
+        System.err.println("Loading graph " + graphBasename + " ...");
+        graph = SwhBidirectionalGraph.loadLabelledMapped(graphBasename);
+        graph.properties.loadLabelNames();
+        System.err.println("Graph loaded.");
+    }
+
+    public void run(long popularityThreshold) {
+        System.out.format("SWHID,filename,occurrences\n");
+
+        long totalNodes = graph.numNodes();
+        AtomicLong totalVisited = new AtomicLong();
+        AtomicLong totalContentsVisited = new AtomicLong();
+
+        long chunkSize = totalNodes / NUM_THREADS;
+        IntStream.range(0, NUM_THREADS).parallel().forEach(threadId -> {
+            HashMap<String, Long> names = new HashMap<>();
+            SwhUnidirectionalGraph backwardGraph = graph.getBackwardGraph().copy();
+            long chunkStart = chunkSize * threadId;
+            long chunkEnd = threadId == NUM_THREADS - 1 ? totalNodes : chunkSize * (threadId + 1);
+            for (long cntNode = chunkStart; cntNode < chunkEnd; cntNode++) {
+                var total_visited = totalVisited.incrementAndGet();
+                if (total_visited % 10000000 == 0) {
+                    float total_visited_f = total_visited;
+                    float total_contents_visited_f = totalContentsVisited.floatValue();
+                    System.err.printf("Visited %.02f B contents (out of %.02f B nodes)\n",
+                            total_contents_visited_f / 1000000000., total_visited_f / 1000000000.);
+                }
+
+                if (graph.getNodeType(cntNode) != SwhType.CNT) {
+                    continue;
+                }
+
+                totalContentsVisited.incrementAndGet();
+
+                names.clear();
+
+                ArcLabelledNodeIterator.LabelledArcIterator s = backwardGraph.labelledSuccessors(cntNode);
+                long dirNode;
+                while ((dirNode = s.nextLong()) >= 0) {
+                    if (graph.getNodeType(dirNode) != SwhType.DIR) {
+                        continue;
+                    }
+                    DirEntry[] labels = (DirEntry[]) s.label().get();
+                    for (DirEntry label : labels) {
+                        String filename = new String(graph.properties.getLabelName(label.filenameId));
+                        names.put(filename, names.getOrDefault(filename, 0L) + 1);
+
+                    }
+                }
+
+                for (Map.Entry<String, Long> entry : names.entrySet()) {
+                    String name = entry.getKey();
+                    Long count = entry.getValue();
+                    if (count < popularityThreshold) {
+                        continue;
+                    }
+                    System.out.format("%s,%s,%d\n", graph.getSWHID(cntNode), name, count);
+                }
+            }
+        });
+
+    }
+}
diff --git a/swh/graph/luigi/misc_datasets.py b/swh/graph/luigi/misc_datasets.py
index a1615b98a..7335d7c21 100644
--- a/swh/graph/luigi/misc_datasets.py
+++ b/swh/graph/luigi/misc_datasets.py
@@ -82,3 +82,34 @@ class TopoSort(luigi.Task):
             | zstdmt -19
         """  # noqa
         run_script(script, Path(self.output().path))
+
+
+class PopularContents(luigi.Task):
+    """Creates a file that lists popular (content SWHID, file name) pairs from a
+    compressed graph."""
+
+    local_graph_path = luigi.PathParameter()
+    popular_contents_path = luigi.PathParameter()
+    graph_name = luigi.Parameter(default="graph")
+    popularity_threshold = luigi.IntParameter(default=100)
+    max_ram = luigi.Parameter(default="300G")
+
+    def requires(self) -> List[luigi.Task]:
+        """Returns an instance of :class:`LocalGraph`."""
+        return [LocalGraph(local_graph_path=self.local_graph_path)]
+
+    def output(self) -> luigi.Target:
+        """.csv.zst file that contains the popular contents."""
+        return luigi.LocalTarget(self.popular_contents_path)
+
+    def run(self) -> None:
+        """Runs org.softwareheritage.graph.utils.PopularContents and compresses its output"""
+        class_name = "org.softwareheritage.graph.utils.PopularContents"
+        # TODO: pass max_ram to run_script() correctly so it can pass it to
+        # check_config(), instead of hardcoding it on the command line here
+        script = f"""
+        java -Xmx{self.max_ram} {class_name} '{self.local_graph_path}/{self.graph_name}'  '{self.popularity_threshold}' \
+            | pv --line-mode --wait \
+            | zstdmt -19
+        """  # noqa
+        run_script(script, Path(self.output().path))
-- 
GitLab


From f7eb7dbb99e14f02413ba9aaf9c3632c027b7366 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 17 Jan 2023 15:23:06 +0100
Subject: [PATCH 2/8] PopularContents: Add length column

---
 .../softwareheritage/graph/utils/PopularContents.java    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index ab94ddd1f..cbd657cc7 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -57,11 +57,12 @@ public class PopularContents {
         System.err.println("Loading graph " + graphBasename + " ...");
         graph = SwhBidirectionalGraph.loadLabelledMapped(graphBasename);
         graph.properties.loadLabelNames();
+        graph.properties.loadContentLength();
         System.err.println("Graph loaded.");
     }
 
     public void run(long popularityThreshold) {
-        System.out.format("SWHID,filename,occurrences\n");
+        System.out.format("SWHID,length,filename,occurrences\n");
 
         long totalNodes = graph.numNodes();
         AtomicLong totalVisited = new AtomicLong();
@@ -104,13 +105,17 @@ public class PopularContents {
                     }
                 }
 
+                Long contentLength = graph.properties.getContentLength(cntNode);
+                if (contentLength == null) {
+                    contentLength = -1L;
+                }
                 for (Map.Entry<String, Long> entry : names.entrySet()) {
                     String name = entry.getKey();
                     Long count = entry.getValue();
                     if (count < popularityThreshold) {
                         continue;
                     }
-                    System.out.format("%s,%s,%d\n", graph.getSWHID(cntNode), name, count);
+                    System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, name, count);
                 }
             }
         });
-- 
GitLab


From 85ff1b6617ff4f8abcbb7a5221e2043bc9b6b33d Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 17 Jan 2023 15:24:27 +0100
Subject: [PATCH 3/8] PopularContents: Add workaround for crash on filename
 decoding

---
 .../graph/utils/PopularContents.java          | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index cbd657cc7..9429eef72 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -99,7 +99,25 @@ public class PopularContents {
                     }
                     DirEntry[] labels = (DirEntry[]) s.label().get();
                     for (DirEntry label : labels) {
-                        String filename = new String(graph.properties.getLabelName(label.filenameId));
+                        String filename;
+                        try {
+                            filename = new String(graph.properties.getLabelName(label.filenameId));
+                        } catch (IllegalArgumentException e) {
+                            /*
+                             * https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4759
+                             *
+                             * Caused by: java.lang.IllegalArgumentException: Input byte array has incorrect ending byte
+                             * at 36 at java.base/java.util.Base64$Decoder.decode0(Base64.java:875) at
+                             * java.base/java.util.Base64$Decoder.decode(Base64.java:566) at
+                             * org.softwareheritage.graph.SwhGraphProperties.getLabelName(SwhGraphProperties.java:333)
+                             * at
+                             * org.softwareheritage.graph.utils.PopularContents.lambda$run$0(PopularContents.java:103)
+                             */
+
+                            System.err.printf("Failed to read filename %d of content %s: %s\n", label.filenameId,
+                                    graph.getSWHID(cntNode), e.toString());
+                            continue;
+                        }
                         names.put(filename, names.getOrDefault(filename, 0L) + 1);
 
                     }
-- 
GitLab


From f1afdfac5938f4c8f1b79a261c2f24ae13c5b1d1 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Mon, 23 Jan 2023 15:45:01 +0100
Subject: [PATCH 4/8] PopularContents: Use ProgressLogger instead of custom
 code

---
 .../graph/utils/PopularContents.java             | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index 9429eef72..dad1f4ced 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -11,6 +11,7 @@ import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
 import com.martiansoftware.jsap.*;
 import org.softwareheritage.graph.*;
 import org.softwareheritage.graph.labels.DirEntry;
+import it.unimi.dsi.logging.ProgressLogger;
 
 import java.io.IOException;
 import java.util.*;
@@ -68,6 +69,11 @@ public class PopularContents {
         AtomicLong totalVisited = new AtomicLong();
         AtomicLong totalContentsVisited = new AtomicLong();
 
+        ProgressLogger pl = new ProgressLogger();
+        pl.itemsName = "content";
+        pl.expectedUpdates = graph.numNodes();
+        pl.start("Listing contents...");
+
         long chunkSize = totalNodes / NUM_THREADS;
         IntStream.range(0, NUM_THREADS).parallel().forEach(threadId -> {
             HashMap<String, Long> names = new HashMap<>();
@@ -75,13 +81,7 @@ public class PopularContents {
             long chunkStart = chunkSize * threadId;
             long chunkEnd = threadId == NUM_THREADS - 1 ? totalNodes : chunkSize * (threadId + 1);
             for (long cntNode = chunkStart; cntNode < chunkEnd; cntNode++) {
-                var total_visited = totalVisited.incrementAndGet();
-                if (total_visited % 10000000 == 0) {
-                    float total_visited_f = total_visited;
-                    float total_contents_visited_f = totalContentsVisited.floatValue();
-                    System.err.printf("Visited %.02f B contents (out of %.02f B nodes)\n",
-                            total_contents_visited_f / 1000000000., total_visited_f / 1000000000.);
-                }
+                pl.update();
 
                 if (graph.getNodeType(cntNode) != SwhType.CNT) {
                     continue;
@@ -138,5 +138,7 @@ public class PopularContents {
             }
         });
 
+        pl.done();
+
     }
 }
-- 
GitLab


From e0f3e2ec653b940168ba89558df91f29b1cf43b4 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Mon, 23 Jan 2023 15:46:44 +0100
Subject: [PATCH 5/8] PopularContents: Add the option to only show the top
 name(s) of each content

---
 .../graph/utils/PopularContents.java          | 143 ++++++++++++++----
 swh/graph/luigi/misc_datasets.py              |   5 +-
 2 files changed, 114 insertions(+), 34 deletions(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index dad1f4ced..796f9d897 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -39,19 +39,20 @@ public class PopularContents {
     private int NUM_THREADS = 96;
 
     public static void main(String[] args) throws IOException, ClassNotFoundException {
-        if (args.length != 2) {
+        if (args.length != 3) {
             System.err.println(
-                    "Syntax: java org.softwareheritage.graph.utils.PopularContents <path/to/graph> <popularity threshold>");
+                    "Syntax: java org.softwareheritage.graph.utils.PopularContents <path/to/graph> <max_results_per_cnt> <popularity_threshold>");
             System.exit(1);
         }
         String graphPath = args[0];
-        long popularityThreshold = Long.parseLong(args[1]);
+        int maxResults = Integer.parseInt(args[1]);
+        long popularityThreshold = Long.parseLong(args[2]);
 
         PopularContents popular_contents = new PopularContents();
 
         popular_contents.loadGraph(graphPath);
 
-        popular_contents.run(popularityThreshold);
+        popular_contents.run(maxResults, popularityThreshold);
     }
 
     public void loadGraph(String graphBasename) throws IOException {
@@ -62,7 +63,7 @@ public class PopularContents {
         System.err.println("Graph loaded.");
     }
 
-    public void run(long popularityThreshold) {
+    public void run(int maxResults, long popularityThreshold) {
         System.out.format("SWHID,length,filename,occurrences\n");
 
         long totalNodes = graph.numNodes();
@@ -76,10 +77,17 @@ public class PopularContents {
 
         long chunkSize = totalNodes / NUM_THREADS;
         IntStream.range(0, NUM_THREADS).parallel().forEach(threadId -> {
-            HashMap<String, Long> names = new HashMap<>();
+            HashMap<Long, Long> names = new HashMap<>();
             SwhUnidirectionalGraph backwardGraph = graph.getBackwardGraph().copy();
             long chunkStart = chunkSize * threadId;
             long chunkEnd = threadId == NUM_THREADS - 1 ? totalNodes : chunkSize * (threadId + 1);
+
+            /*
+             * priority heap used to only print filenames with the most occurrences for each content
+             */
+            PriorityQueue<Long> heap = new PriorityQueue<Long>((maxResults > 0) ? maxResults : 1,
+                    new SortByHashmap(names));
+
             for (long cntNode = chunkStart; cntNode < chunkEnd; cntNode++) {
                 pl.update();
 
@@ -99,26 +107,7 @@ public class PopularContents {
                     }
                     DirEntry[] labels = (DirEntry[]) s.label().get();
                     for (DirEntry label : labels) {
-                        String filename;
-                        try {
-                            filename = new String(graph.properties.getLabelName(label.filenameId));
-                        } catch (IllegalArgumentException e) {
-                            /*
-                             * https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4759
-                             *
-                             * Caused by: java.lang.IllegalArgumentException: Input byte array has incorrect ending byte
-                             * at 36 at java.base/java.util.Base64$Decoder.decode0(Base64.java:875) at
-                             * java.base/java.util.Base64$Decoder.decode(Base64.java:566) at
-                             * org.softwareheritage.graph.SwhGraphProperties.getLabelName(SwhGraphProperties.java:333)
-                             * at
-                             * org.softwareheritage.graph.utils.PopularContents.lambda$run$0(PopularContents.java:103)
-                             */
-
-                            System.err.printf("Failed to read filename %d of content %s: %s\n", label.filenameId,
-                                    graph.getSWHID(cntNode), e.toString());
-                            continue;
-                        }
-                        names.put(filename, names.getOrDefault(filename, 0L) + 1);
+                        names.put(label.filenameId, names.getOrDefault(label.filenameId, 0L) + 1);
 
                     }
                 }
@@ -127,13 +116,72 @@ public class PopularContents {
                 if (contentLength == null) {
                     contentLength = -1L;
                 }
-                for (Map.Entry<String, Long> entry : names.entrySet()) {
-                    String name = entry.getKey();
-                    Long count = entry.getValue();
-                    if (count < popularityThreshold) {
-                        continue;
+                if (names.size() == 0) {
+                    /* No filename at all */
+                    continue;
+                } else if (maxResults <= 0 || maxResults >= names.size()) {
+                    /* Print everything */
+                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
+                        long filenameId = entry.getKey();
+                        Long count = entry.getValue();
+                        if (count < popularityThreshold) {
+                            continue;
+                        }
+                        String filename = getFilename(filenameId, dirNode);
+                        if (filename == null) {
+                            continue;
+                        }
+                        System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, filename, count);
+                    }
+                } else if (maxResults == 1) {
+                    /*
+                     * Print only the result with the most occurrence. This case could be merged with the one below, but
+                     * avoiding the priority heap has much better performance.
+                     */
+                    long maxFilenameId = 0;
+                    long maxCount = 0;
+
+                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
+                        Long count = entry.getValue();
+                        if (count > maxCount) {
+                            maxFilenameId = entry.getKey();
+                            maxCount = count;
+                        }
+                    }
+
+                    if (maxCount > 0) {
+                        String filename = getFilename(maxFilenameId, dirNode);
+                        if (filename == null) {
+                            continue;
+                        }
+                        System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, filename, maxCount);
                     }
-                    System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, name, count);
+                } else {
+                    /* Print only results with the most occurrences */
+                    int nbResultsInHeap = 0;
+                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
+                        Long filenameId = entry.getKey();
+                        Long count = entry.getValue();
+                        if (count < popularityThreshold) {
+                            continue;
+                        }
+                        heap.add(filenameId);
+                        if (nbResultsInHeap == maxResults) {
+                            heap.poll();
+                        } else {
+                            nbResultsInHeap++;
+                        }
+                    }
+
+                    for (Long filenameId : heap) {
+                        String filename = getFilename(filenameId, dirNode);
+                        if (filename == null) {
+                            continue;
+                        }
+                        System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, filename,
+                                names.get(filenameId));
+                    }
+                    heap.clear();
                 }
             }
         });
@@ -141,4 +189,35 @@ public class PopularContents {
         pl.done();
 
     }
+
+    /* Decodes the filename with the given id; logs to stderr and returns null if decoding fails. */
+    private String getFilename(long filenameId, long dirNode) {
+        try {
+            return new String(graph.properties.getLabelName(filenameId));
+        } catch (IllegalArgumentException e) {
+            /*
+             * Workaround for https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4759: Base64
+             * decoding in SwhGraphProperties.getLabelName fails with "Input byte array has incorrect ending byte".
+             *
+             * run() calls this after its successor loop has ended, so dirNode can be the negative
+             * end-of-iteration value; only resolve it to a SWHID when it can be a valid node id.
+             */
+
+            String dirSwhid = (dirNode >= 0) ? String.valueOf(graph.getSWHID(dirNode)) : "<unknown>";
+            System.err.printf("Failed to read filename %d of directory %s: %s\n", filenameId, dirSwhid,
+                    e.toString());
+            return null;
+        }
+    }
+
+    private class SortByHashmap implements Comparator<Long> {
+        private HashMap<Long, Long> map;
+        public SortByHashmap(HashMap<Long, Long> map) {
+            this.map = map;
+        }
+
+        public int compare(Long l1, Long l2) {
+            return map.get(l1).compareTo(map.get(l2));
+        }
+    }
 }
diff --git a/swh/graph/luigi/misc_datasets.py b/swh/graph/luigi/misc_datasets.py
index 7335d7c21..7f500628b 100644
--- a/swh/graph/luigi/misc_datasets.py
+++ b/swh/graph/luigi/misc_datasets.py
@@ -91,7 +91,8 @@ class PopularContents(luigi.Task):
     local_graph_path = luigi.PathParameter()
     popular_contents_path = luigi.PathParameter()
     graph_name = luigi.Parameter(default="graph")
-    popularity_threshold = luigi.IntParameter(default=100)
+    max_results_per_content = luigi.IntParameter(default=0)
+    popularity_threshold = luigi.IntParameter(default=0)
     max_ram = luigi.Parameter(default="300G")
 
     def requires(self) -> List[luigi.Task]:
@@ -108,7 +109,7 @@ class PopularContents(luigi.Task):
         # TODO: pass max_ram to run_script() correctly so it can pass it to
         # check_config(), instead of hardcoding it on the command line here
         script = f"""
-        java -Xmx{self.max_ram} {class_name} '{self.local_graph_path}/{self.graph_name}'  '{self.popularity_threshold}' \
+        java -Xmx{self.max_ram} {class_name} '{self.local_graph_path}/{self.graph_name}'  '{self.max_results_per_content}' '{self.popularity_threshold}' \
             | pv --line-mode --wait \
             | zstdmt -19
         """  # noqa
-- 
GitLab


From e79b46d93e27eab98ebed62b01db9f991b1ea0df Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 24 Jan 2023 11:51:29 +0100
Subject: [PATCH 6/8] PopularContents: Make ProgressLogger display the class
 name

---
 .../org/softwareheritage/graph/utils/PopularContents.java   | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index 796f9d897..08f33599a 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -17,6 +17,8 @@ import java.io.IOException;
 import java.util.*;
 import java.util.stream.IntStream;
 import java.util.concurrent.atomic.AtomicLong;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /* Lists all nodes nodes of the types given as argument, in topological order,
  * from leaves (contents, if selected) to the top (origins, if selected).
@@ -38,6 +40,8 @@ public class PopularContents {
     private SwhBidirectionalGraph graph;
     private int NUM_THREADS = 96;
 
+    final static Logger logger = LoggerFactory.getLogger(PopularContents.class);
+
     public static void main(String[] args) throws IOException, ClassNotFoundException {
         if (args.length != 3) {
             System.err.println(
@@ -70,7 +74,7 @@ public class PopularContents {
         AtomicLong totalVisited = new AtomicLong();
         AtomicLong totalContentsVisited = new AtomicLong();
 
-        ProgressLogger pl = new ProgressLogger();
+        ProgressLogger pl = new ProgressLogger(logger);
         pl.itemsName = "content";
         pl.expectedUpdates = graph.numNodes();
         pl.start("Listing contents...");
-- 
GitLab


From 70586134f7d50b1f178e9ca6904e97700a1244b2 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 24 Jan 2023 13:39:56 +0100
Subject: [PATCH 7/8] PopularContents: Fix plural

---
 .../java/org/softwareheritage/graph/utils/PopularContents.java  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index 08f33599a..8078f6ce5 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -75,7 +75,7 @@ public class PopularContents {
         AtomicLong totalContentsVisited = new AtomicLong();
 
         ProgressLogger pl = new ProgressLogger(logger);
-        pl.itemsName = "content";
+        pl.itemsName = "contents";
         pl.expectedUpdates = graph.numNodes();
         pl.start("Listing contents...");
 
-- 
GitLab


From 27e375733c3bdcd97e4f6ed60152718f5d4b805e Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 27 Jan 2023 15:20:45 +0100
Subject: [PATCH 8/8] PopularContents: Fix item name in logs

---
 .../java/org/softwareheritage/graph/utils/PopularContents.java  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
index 8078f6ce5..0b545febe 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+++ b/java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
@@ -75,7 +75,7 @@ public class PopularContents {
         AtomicLong totalContentsVisited = new AtomicLong();
 
         ProgressLogger pl = new ProgressLogger(logger);
-        pl.itemsName = "contents";
+        pl.itemsName = "nodes";
         pl.expectedUpdates = graph.numNodes();
         pl.start("Listing contents...");
 
-- 
GitLab