`IllegalArgumentException: Input byte array has incorrect ending byte at 36` while reading filename label
Visited 6.23 B contents (out of 12.57 B nodes) <=> ]
Exception in thread "main" java.lang.IllegalArgumentException: java.lang.IllegalArgumentException: Input byte array has incorrect ending byte at 36 ]
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
at java.base/java.util.concurrent.ForkJoinTask.getThrowableException(ForkJoinTask.java:562)
at java.base/java.util.concurrent.ForkJoinTask.reportException(ForkJoinTask.java:591)
at java.base/java.util.concurrent.ForkJoinTask.invoke(ForkJoinTask.java:689)
at java.base/java.util.stream.ForEachOps$ForEachOp.evaluateParallel(ForEachOps.java:159)
at java.base/java.util.stream.ForEachOps$ForEachOp$OfInt.evaluateParallel(ForEachOps.java:188)
at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:233)
at java.base/java.util.stream.IntPipeline.forEach(IntPipeline.java:463)
at java.base/java.util.stream.IntPipeline$Head.forEach(IntPipeline.java:620)
at org.softwareheritage.graph.utils.PopularContents.run(PopularContents.java:73)
at org.softwareheritage.graph.utils.PopularContents.main(PopularContents.java:55)
Caused by: java.lang.IllegalArgumentException: Input byte array has incorrect ending byte at 36
at java.base/java.util.Base64$Decoder.decode0(Base64.java:875)
at java.base/java.util.Base64$Decoder.decode(Base64.java:566)
at org.softwareheritage.graph.SwhGraphProperties.getLabelName(SwhGraphProperties.java:333)
at org.softwareheritage.graph.utils.PopularContents.lambda$run$0(PopularContents.java:103)
at java.base/java.util.stream.ForEachOps$ForEachOp$OfInt.accept(ForEachOps.java:204)
at java.base/java.util.stream.Streams$RangeIntSpliterator.forEachRemaining(Streams.java:104)
at java.base/java.util.Spliterator$OfInt.forEachRemaining(Spliterator.java:711)
at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
at java.base/java.util.stream.ForEachOps$ForEachTask.compute(ForEachOps.java:290)
at java.base/java.util.concurrent.CountedCompleter.exec(CountedCompleter.java:754)
at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:373)
at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1182)
at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1655)
at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1622)
at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:165)
64.7M 26:33:10 [ 676 /s] [ <=> ]
triggered by this code:
/*
* Copyright (c) 2020-2023 The Software Heritage developers
* See the AUTHORS file at the top-level directory of this distribution
* License: GNU General Public License version 3, or any later version
* See top-level LICENSE file for more information
*/
package org.softwareheritage.graph.utils;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
import org.softwareheritage.graph.*;
import org.softwareheritage.graph.labels.DirEntry;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.IntStream;
/* For every content node in the graph, counts under which file names it appears
 * in its parent directories, and prints one CSV row (SWHID, filename,
 * occurrences) for each name occurring at least <popularity threshold> times.
 *
 * The node id range is split into even chunks, processed in parallel.
*
* Sample invocation:
*
* $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx500G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.PopularContents /dev/shm/swh-graph/default/graph \
* | pv --line-mode --wait \
* | zstdmt \
* > /poolswh/softwareheritage/vlorentz/2022-04-25_popular_contents.txt.zst
*/
public class PopularContents {
    private SwhBidirectionalGraph graph;

    /* Number of worker threads; the node id range is split evenly across them. */
    private final int NUM_THREADS = 96;

    /**
     * Entry point. Expects two arguments: the graph basename and the popularity
     * threshold; loads the labelled graph then streams CSV rows to stdout.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        if (args.length != 2) {
            System.err.println(
                    "Syntax: java org.softwareheritage.graph.utils.PopularContents <path/to/graph> <popularity threshold>");
            System.exit(1);
        }
        String graphPath = args[0];
        long popularityThreshold = Long.parseLong(args[1]);

        PopularContents popularContents = new PopularContents();
        popularContents.loadGraph(graphPath);
        popularContents.run(popularityThreshold);
    }

    /**
     * Memory-maps the labelled bidirectional graph and loads the label-name
     * table needed to resolve directory-entry filename ids.
     *
     * @param graphBasename path prefix of the compressed graph files
     * @throws IOException if the graph files cannot be read
     */
    public void loadGraph(String graphBasename) throws IOException {
        System.err.println("Loading graph " + graphBasename + " ...");
        graph = SwhBidirectionalGraph.loadLabelledMapped(graphBasename);
        graph.properties.loadLabelNames();
        System.err.println("Graph loaded.");
    }

    /**
     * For every content node, counts the filenames under which its parent
     * directories reference it, and prints "SWHID,filename,occurrences" for
     * every name seen at least {@code popularityThreshold} times.
     *
     * @param popularityThreshold minimum occurrence count for a name to be printed
     */
    public void run(long popularityThreshold) {
        System.out.format("SWHID,filename,occurrences\n");

        long totalNodes = graph.numNodes();
        AtomicLong totalVisited = new AtomicLong();
        AtomicLong totalContentsVisited = new AtomicLong();
        long chunkSize = totalNodes / NUM_THREADS;

        IntStream.range(0, NUM_THREADS).parallel().forEach(threadId -> {
            HashMap<String, Long> names = new HashMap<>();
            // Per-thread copy: successor iteration on the underlying BVGraph
            // is stateful and must not be shared between threads.
            SwhUnidirectionalGraph backwardGraph = graph.getBackwardGraph().copy();
            long chunkStart = chunkSize * threadId;
            // The last thread also takes the remainder of the integer division.
            long chunkEnd = threadId == NUM_THREADS - 1 ? totalNodes : chunkSize * (threadId + 1);
            for (long cntNode = chunkStart; cntNode < chunkEnd; cntNode++) {
                long visitedSoFar = totalVisited.incrementAndGet();
                if (visitedSoFar % 10_000_000 == 0) {
                    // double (not float) so multi-billion counts keep precision.
                    System.err.printf("Visited %.02f B contents (out of %.02f B nodes)\n",
                            totalContentsVisited.doubleValue() / 1000000000., visitedSoFar / 1000000000.);
                }
                if (graph.getNodeType(cntNode) != SwhType.CNT) {
                    continue;
                }
                totalContentsVisited.incrementAndGet();
                names.clear();
                ArcLabelledNodeIterator.LabelledArcIterator s = backwardGraph.labelledSuccessors(cntNode);
                long dirNode;
                while ((dirNode = s.nextLong()) >= 0) {
                    if (graph.getNodeType(dirNode) != SwhType.DIR) {
                        continue;
                    }
                    DirEntry[] labels = (DirEntry[]) s.label().get();
                    for (DirEntry label : labels) {
                        byte[] labelName;
                        // getLabelName() is not safe to call concurrently: parallel
                        // runs were observed to throw "IllegalArgumentException:
                        // Input byte array has incorrect ending byte" from its
                        // base64 decoding. Serialize access until
                        // SwhGraphProperties.getLabelName is made thread-safe.
                        synchronized (graph.properties) {
                            labelName = graph.properties.getLabelName(label.filenameId);
                        }
                        // Decode explicitly as UTF-8 instead of the platform charset.
                        String filename = new String(labelName, StandardCharsets.UTF_8);
                        names.merge(filename, 1L, Long::sum);
                    }
                }
                for (Map.Entry<String, Long> entry : names.entrySet()) {
                    String name = entry.getKey();
                    Long count = entry.getValue();
                    if (count < popularityThreshold) {
                        continue;
                    }
                    // NOTE(review): filenames containing ',' or newlines are not
                    // escaped, so such rows are malformed CSV; kept as-is to avoid
                    // changing the output format downstream consumers parse.
                    System.out.format("%s,%s,%d\n", graph.getSWHID(cntNode), name, count);
                }
            }
        });
    }
}
This code was generating the dataset in #4747 (closed), using a very simple heuristic.