Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-graph
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Antoine Lambert
swh-graph
Commits
1a7e84a9
Commit
1a7e84a9
authored
2 years ago
by
vlorentz
Browse files
Options
Downloads
Patches
Plain Diff
Add PopularContents script to generate a list of popular (swh:1:cnt, file_name) pairs
parent
13b01430
No related branches found
Branches containing commit
Tags
v6.3.1
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
+227
-0
227 additions, 0 deletions
...ava/org/softwareheritage/graph/utils/PopularContents.java
swh/graph/luigi/misc_datasets.py
+32
-0
32 additions, 0 deletions
swh/graph/luigi/misc_datasets.py
with
259 additions
and
0 deletions
java/src/main/java/org/softwareheritage/graph/utils/PopularContents.java
0 → 100644
+
227
−
0
View file @
1a7e84a9
/*
* Copyright (c) 2020-2023 The Software Heritage developers
* See the AUTHORS file at the top-level directory of this distribution
* License: GNU General Public License version 3, or any later version
* See top-level LICENSE file for more information
*/
package
org.softwareheritage.graph.utils
;
import
it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator
;
import
com.martiansoftware.jsap.*
;
import
org.softwareheritage.graph.*
;
import
org.softwareheritage.graph.labels.DirEntry
;
import
it.unimi.dsi.logging.ProgressLogger
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.stream.IntStream
;
import
java.util.concurrent.atomic.AtomicLong
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
 * Lists, for every content node of the graph, the file names that directories
 * use to refer to it, along with the number of directory entries using each name.
 *
 * Results are written to stdout as CSV, starting with the header
 * {@code SWHID,length,filename,occurrences}. Node ids are split into contiguous
 * chunks that are scanned in parallel, so rows are not emitted in node order.
 * File names are written as-is (no CSV quoting or escaping is applied).
 *
 * Arguments: {@code <path/to/graph> <max_results_per_cnt> <popularity_threshold>}
 *
 * Sample invocation:
 *
 * $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx500G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.PopularContents /dev/shm/swh-graph/default/graph 0 0 \
 *     | pv --line-mode --wait \
 *     | zstdmt \
 *     > /poolswh/softwareheritage/vlorentz/2022-04-25_popular_contents.txt.zst
 */
public class PopularContents {
    /** Number of contiguous node-id chunks the graph is split into for the parallel scan. */
    private static final int NUM_THREADS = 96;

    final static Logger logger = LoggerFactory.getLogger(PopularContents.class);

    private SwhBidirectionalGraph graph;

    /**
     * Command-line entry point.
     *
     * @param args exactly three values: graph basename, maximum number of file
     *            names printed per content, and minimum number of occurrences
     *            for a file name to be printed
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        if (args.length != 3) {
            System.err.println(
                    "Syntax: java org.softwareheritage.graph.utils.PopularContents <path/to/graph> <max_results_per_cnt> <popularity_threshold>");
            System.exit(1);
        }
        String graphPath = args[0];
        int maxResults = Integer.parseInt(args[1]);
        long popularityThreshold = Long.parseLong(args[2]);

        PopularContents popular_contents = new PopularContents();

        popular_contents.loadGraph(graphPath);

        popular_contents.run(maxResults, popularityThreshold);
    }

    /**
     * Loads the labelled graph together with the properties needed by
     * {@link #run(int, long)}: label names and content lengths.
     *
     * @param graphBasename basename of the compressed graph files
     * @throws IOException if the graph or its properties cannot be read
     */
    public void loadGraph(String graphBasename) throws IOException {
        System.err.println("Loading graph " + graphBasename + " ...");
        graph = SwhBidirectionalGraph.loadLabelledMapped(graphBasename);
        graph.properties.loadLabelNames();
        graph.properties.loadContentLength();
        System.err.println("Graph loaded.");
    }

    /**
     * Scans all content nodes and prints one CSV row per selected
     * (content, file name) pair.
     *
     * @param maxResults maximum number of file names printed for a single
     *            content, keeping those with the most occurrences; zero or a
     *            negative value means "no limit"
     * @param popularityThreshold file names occurring fewer times than this for
     *            a given content are not printed
     */
    public void run(int maxResults, long popularityThreshold) {
        System.out.format("SWHID,length,filename,occurrences\n");

        final long totalNodes = graph.numNodes();
        final AtomicLong totalContentsVisited = new AtomicLong();

        ProgressLogger pl = new ProgressLogger(logger);
        pl.itemsName = "nodes";
        pl.expectedUpdates = totalNodes;
        pl.start("Listing contents...");

        final long chunkSize = totalNodes / NUM_THREADS;

        IntStream.range(0, NUM_THREADS).parallel().forEach(threadId -> {
            /* Occurrences of each filename id for the content currently being processed */
            HashMap<Long, Long> names = new HashMap<>();
            /* Each worker iterates on its own copy of the backward graph */
            SwhUnidirectionalGraph backwardGraph = graph.getBackwardGraph().copy();

            long chunkStart = chunkSize * threadId;
            /* The last chunk also takes the remainder of the integer division */
            long chunkEnd = threadId == NUM_THREADS - 1 ? totalNodes : chunkSize * (threadId + 1);
            long contentsInChunk = 0;

            /*
             * priority heap used to only print filenames with the most occurrences for each content.
             * Its comparator reads the counts from `names`, so the least frequent name is at the head.
             */
            PriorityQueue<Long> heap = new PriorityQueue<>((maxResults > 0) ? maxResults : 1,
                    new SortByHashmap(names));

            for (long cntNode = chunkStart; cntNode < chunkEnd; cntNode++) {
                /*
                 * NOTE(review): pl is shared by all workers and updated without synchronization;
                 * confirm ProgressLogger tolerates concurrent update() calls (counts may be approximate).
                 */
                pl.update();

                if (graph.getNodeType(cntNode) != SwhType.CNT) {
                    continue;
                }
                contentsInChunk++;

                /* Count, per filename id, the directory entries pointing to this content */
                names.clear();
                ArcLabelledNodeIterator.LabelledArcIterator s = backwardGraph.labelledSuccessors(cntNode);
                long dirNode;
                while ((dirNode = s.nextLong()) >= 0) {
                    if (graph.getNodeType(dirNode) != SwhType.DIR) {
                        continue;
                    }
                    DirEntry[] labels = (DirEntry[]) s.label().get();
                    for (DirEntry label : labels) {
                        names.merge(label.filenameId, 1L, Long::sum);
                    }
                }

                if (names.isEmpty()) {
                    /* No filename at all */
                    continue;
                }

                Long knownLength = graph.properties.getContentLength(cntNode);
                /* -1 stands for "length not available" in the output */
                long contentLength = (knownLength == null) ? -1L : knownLength;

                if (maxResults <= 0 || maxResults >= names.size()) {
                    /* Print everything */
                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
                        long count = entry.getValue();
                        if (count < popularityThreshold) {
                            continue;
                        }
                        printOccurrence(cntNode, contentLength, entry.getKey(), count);
                    }
                } else if (maxResults == 1) {
                    /*
                     * Print only the result with the most occurrence. This case could be merged with the one below, but
                     * avoiding the priority heap has much better performance.
                     */
                    long maxFilenameId = 0;
                    long maxCount = 0;
                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
                        long count = entry.getValue();
                        if (count > maxCount) {
                            maxFilenameId = entry.getKey();
                            maxCount = count;
                        }
                    }
                    /* Apply the popularity threshold here too, like the other two branches do */
                    if (maxCount > 0 && maxCount >= popularityThreshold) {
                        printOccurrence(cntNode, contentLength, maxFilenameId, maxCount);
                    }
                } else {
                    /* Print only results with the most occurrences */
                    for (Map.Entry<Long, Long> entry : names.entrySet()) {
                        if (entry.getValue() < popularityThreshold) {
                            continue;
                        }
                        heap.add(entry.getKey());
                        if (heap.size() > maxResults) {
                            /* Evict the least frequent name to stay within maxResults */
                            heap.poll();
                        }
                    }
                    /* Iteration follows the heap's internal order, not the order of counts */
                    for (Long filenameId : heap) {
                        printOccurrence(cntNode, contentLength, filenameId, names.get(filenameId));
                    }
                    heap.clear();
                }
            }

            totalContentsVisited.addAndGet(contentsInChunk);
        });

        pl.done();
        logger.info("Visited {} content nodes.", totalContentsVisited.get());
    }

    /**
     * Prints one CSV row for a (content, file name) pair; prints nothing when
     * the file name cannot be decoded.
     */
    private void printOccurrence(long cntNode, long contentLength, long filenameId, long count) {
        String filename = getFilename(filenameId, cntNode);
        if (filename == null) {
            return;
        }
        System.out.format("%s,%d,%s,%d\n", graph.getSWHID(cntNode), contentLength, filename, count);
    }

    /**
     * Resolves a filename id to its name.
     *
     * @param filenameId id of the label name to look up
     * @param cntNode content node the name was found on, only used in the error message
     * @return the file name, or {@code null} if the label name could not be decoded
     */
    private String getFilename(long filenameId, long cntNode) {
        try {
            /* Decode explicitly as UTF-8 rather than with the platform default charset */
            return new String(graph.properties.getLabelName(filenameId), java.nio.charset.StandardCharsets.UTF_8);
        } catch (IllegalArgumentException e) {
            /*
             * https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4759
             *
             * Caused by: java.lang.IllegalArgumentException: Input byte array has incorrect ending byte at 36
             * at java.base/java.util.Base64$Decoder.decode0(Base64.java:875) at
             * java.base/java.util.Base64$Decoder.decode(Base64.java:566) at
             * org.softwareheritage.graph.SwhGraphProperties.getLabelName(SwhGraphProperties.java:333) at
             * org.softwareheritage.graph.utils.PopularContents.lambda$run$0(PopularContents.java:103)
             *
             * The content node is reported because, once occurrences have been aggregated, no single
             * directory is associated with a filename id anymore.
             */
            System.err.printf("Failed to read filename %d of content %s: %s\n", filenameId,
                    graph.getSWHID(cntNode), e.toString());
            return null;
        }
    }

    /**
     * Orders keys by the value currently associated with them in a map. Both
     * compared keys must be present in the map when the comparison happens.
     */
    private static final class SortByHashmap implements Comparator<Long> {
        private final Map<Long, Long> map;

        public SortByHashmap(Map<Long, Long> map) {
            this.map = map;
        }

        @Override
        public int compare(Long l1, Long l2) {
            return map.get(l1).compareTo(map.get(l2));
        }
    }
}
This diff is collapsed.
Click to expand it.
swh/graph/luigi/misc_datasets.py
+
32
−
0
View file @
1a7e84a9
...
...
@@ -82,3 +82,35 @@ class TopoSort(luigi.Task):
| zstdmt -19
"""
# noqa
run_script
(
script
,
Path
(
self
.
output
().
path
))
class PopularContents(luigi.Task):
    """Creates a compressed CSV file listing, for the contents of a compressed
    graph, the file names they are referenced with and the number of occurrences
    of each name.

    ``max_results_per_content`` and ``popularity_threshold`` are passed as-is to
    ``org.softwareheritage.graph.utils.PopularContents``; with the default of 0
    for both, no file name is filtered out."""

    local_graph_path = luigi.PathParameter()
    popular_contents_path = luigi.PathParameter()
    graph_name = luigi.Parameter(default="graph")
    max_results_per_content = luigi.IntParameter(default=0)
    popularity_threshold = luigi.IntParameter(default=0)
    max_ram = luigi.Parameter(default="300G")

    def requires(self) -> List[luigi.Task]:
        """Returns an instance of :class:`LocalGraph`."""
        return [LocalGraph(local_graph_path=self.local_graph_path)]

    def output(self) -> luigi.Target:
        """.csv.zst file that contains the (content, file name, occurrences) rows."""
        return luigi.LocalTarget(self.popular_contents_path)

    def run(self) -> None:
        """Runs org.softwareheritage.graph.utils.PopularContents and compresses
        its output"""
        import shlex

        class_name = "org.softwareheritage.graph.utils.PopularContents"
        # Quote with shlex rather than bare single quotes, so that a graph path
        # which itself contains a quote cannot break out of the argument.
        graph_path = shlex.quote(f"{self.local_graph_path}/{self.graph_name}")
        # TODO: pass max_ram to run_script() correctly so it can pass it to
        # check_config(), instead of hardcoding it on the command line here
        script = f"""
        java -Xmx{self.max_ram} {class_name} {graph_path} '{self.max_results_per_content}' '{self.popularity_threshold}' \
            | pv --line-mode --wait \
            | zstdmt -19
        """  # noqa
        run_script(script, Path(self.output().path))
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment