Skip to content
Snippets Groups Projects
Commit 02d59dd7 authored by Tommaso Fontana's avatar Tommaso Fontana
Browse files

Added Node2Type, its tests, and a bin to convert the .node2swhid.bin file to...

Added Node2Type, its tests, and a bin to convert the .node2swhid.bin file to the new .node2type.bin
parent 39edd560
No related branches found
No related tags found
1 merge request!326Add Node2Type and a bin to convert the .node2swhid.bin file to the new .node2type.bin
Pipeline #3492 passed
......@@ -4,9 +4,9 @@ version = 3
[[package]]
name = "addr2line"
version = "0.19.0"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
dependencies = [
"gimli",
]
......@@ -124,9 +124,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "backtrace"
version = "0.3.67"
version = "0.3.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
dependencies = [
"addr2line",
"cc",
......@@ -156,7 +156,7 @@ dependencies = [
"regex",
"rustc-hash",
"shlex",
"syn 2.0.18",
"syn 2.0.23",
"which",
]
......@@ -168,9 +168,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.3.2"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbe3c979c178231552ecba20214a8272df4e09f232a87aef4320cf06539aded"
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
[[package]]
name = "bitvec"
......@@ -259,9 +259,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.3.5"
version = "4.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2686c4115cb0810d9a984776e197823d08ec94f176549a89a9efded477c456dc"
checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d"
dependencies = [
"clap_builder",
"clap_derive",
......@@ -270,13 +270,12 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.3.5"
version = "4.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae"
checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b"
dependencies = [
"anstream",
"anstyle",
"bitflags 1.3.2",
"clap_lex",
"strsim",
]
......@@ -290,7 +289,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
......@@ -367,7 +366,7 @@ dependencies = [
[[package]]
name = "dsi-bitstream"
version = "0.1.0"
source = "git+https://github.com/vigna/dsi-bitstream-rs#29e5c3f40a42cccb00ce08600b0de31fb4d41b7c"
source = "git+https://github.com/vigna/dsi-bitstream-rs#380b90846cf5d8229910fd96b5c3305af8b4deb0"
dependencies = [
"anyhow",
"rand",
......@@ -543,18 +542,9 @@ dependencies = [
[[package]]
name = "hermit-abi"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "iana-time-zone"
......@@ -594,20 +584,19 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
"hermit-abi 0.3.1",
"hermit-abi 0.3.2",
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
version = "0.4.7"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix",
"hermit-abi 0.3.2",
"rustix 0.38.3",
"windows-sys",
]
......@@ -622,9 +611,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "1.0.6"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a"
[[package]]
name = "java-properties"
......@@ -689,6 +678,12 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0"
[[package]]
name = "log"
version = "0.4.19"
......@@ -736,9 +731,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "miniz_oxide"
version = "0.6.2"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
dependencies = [
"adler",
]
......@@ -814,19 +809,19 @@ dependencies = [
[[package]]
name = "num_cpus"
version = "1.15.0"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi 0.2.6",
"hermit-abi 0.3.2",
"libc",
]
[[package]]
name = "object"
version = "0.30.4"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03b4680b86d9cfafba8fc491dc9b6df26b68cf40e9e6cd73909194759a63c385"
checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
dependencies = [
"memchr",
]
......@@ -873,28 +868,28 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "prettyplease"
version = "0.2.9"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9825a04601d60621feed79c4e6b56d65db77cdca55cef43b46b0de1096d1c282"
checksum = "92139198957b410250d43fad93e630d956499a625c527eda65175c8680f83387"
dependencies = [
"proc-macro2",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
name = "proc-macro2"
version = "1.0.60"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.28"
version = "1.0.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105"
dependencies = [
"proc-macro2",
]
......@@ -968,9 +963,21 @@ dependencies = [
[[package]]
name = "regex"
version = "1.8.4"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56"
dependencies = [
"aho-corasick",
"memchr",
......@@ -979,9 +986,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.7.2"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
[[package]]
name = "rustc-demangle"
......@@ -997,15 +1004,28 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
version = "0.37.20"
version = "0.37.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0"
checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06"
dependencies = [
"bitflags 1.3.2",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"linux-raw-sys 0.3.8",
"windows-sys",
]
[[package]]
name = "rustix"
version = "0.38.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4"
dependencies = [
"bitflags 2.3.3",
"errno",
"libc",
"linux-raw-sys 0.4.3",
"windows-sys",
]
......@@ -1026,9 +1046,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "serde"
version = "1.0.164"
version = "1.0.167"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
checksum = "7daf513456463b42aa1d94cff7e0c24d682b429f020b9afa4f5ba5c40a22b237"
[[package]]
name = "shlex"
......@@ -1070,10 +1090,10 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "sux"
version = "0.1.0"
source = "git+https://github.com/vigna/sux-rs#25fbdf42024b6cbe98741bd0d8135f3188293677"
source = "git+https://github.com/vigna/sux-rs#9c47cceefb8e6b45118c99fd72a4cf040eb14ad3"
dependencies = [
"anyhow",
"bitflags 2.3.2",
"bitflags 2.3.3",
"bytemuck",
"mmap-rs",
"stable_deref_trait",
......@@ -1086,6 +1106,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"bitvec",
"clap",
"dsi-progress-logger",
"libc",
"log",
......@@ -1109,9 +1130,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.18"
version = "2.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737"
dependencies = [
"proc-macro2",
"quote",
......@@ -1134,9 +1155,9 @@ dependencies = [
[[package]]
name = "sysinfo"
version = "0.29.2"
version = "0.29.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9557d0845b86eea8182f7b10dff120214fb6cd9fd937b6f4917714e546a38695"
checksum = "751e810399bba86e9326f5762b7f32ac5a085542df78da6a78d94e07d14d7c11"
dependencies = [
"cfg-if",
"core-foundation-sys",
......@@ -1163,7 +1184,7 @@ dependencies = [
"cfg-if",
"fastrand",
"redox_syscall",
"rustix",
"rustix 0.37.23",
"windows-sys",
]
......@@ -1178,22 +1199,22 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.40"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
checksum = "a35fc5b8971143ca348fa6df4f024d4d55264f3468c71ad1c2f365b0a4d58c42"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.40"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
]
[[package]]
......@@ -1208,9 +1229,9 @@ dependencies = [
[[package]]
name = "unicode-ident"
version = "1.0.9"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73"
[[package]]
name = "utf8parse"
......@@ -1255,7 +1276,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
"wasm-bindgen-shared",
]
......@@ -1277,7 +1298,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.18",
"syn 2.0.23",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
......@@ -1291,7 +1312,7 @@ checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
[[package]]
name = "webgraph"
version = "0.1.0"
source = "git+https://github.com/vigna/webgraph-rs#319a8a30fcf7acf42c052e593cb72b6d8f0f2f5f"
source = "git+https://github.com/vigna/webgraph-rs#c0a89d22add27a2ae81bf8a7f024ee13422067d6"
dependencies = [
"anyhow",
"bindgen",
......@@ -1376,7 +1397,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
dependencies = [
"windows-targets 0.48.0",
"windows-targets 0.48.1",
]
[[package]]
......@@ -1385,7 +1406,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.0",
"windows-targets 0.48.1",
]
[[package]]
......@@ -1405,9 +1426,9 @@ dependencies = [
[[package]]
name = "windows-targets"
version = "0.48.0"
version = "0.48.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
dependencies = [
"windows_aarch64_gnullvm 0.48.0",
"windows_aarch64_msvc 0.48.0",
......
......@@ -12,16 +12,17 @@ categories = ["compression", "graph"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.71"
anyhow = {version="1.0.71", features=["backtrace"]}
mmap-rs = "0.5.0"
sux = {git = "https://github.com/vigna/sux-rs"}
webgraph = {git = "https://github.com/vigna/webgraph-rs"}
log = "0.4.17"
libc = "0.2.147"
[dev-dependencies]
stderrlog = "0.5.4"
dsi-progress-logger = "0.1.0"
clap = { version = "4.1.6", features = ["derive"] }
[dev-dependencies]
bitvec = { version = "1.0.1", features = ["atomic"] }
zstd = "0.12"
......
use anyhow::{Context, Result};
use clap::Parser;
use dsi_progress_logger::ProgressLogger;
use log::info;
use swh_graph::map::{Node2SWHID, Node2Type};
// Command-line arguments of the `node2type` binary, parsed through clap's
// `Parser` derive.
// NOTE(review): the `///` comments on the fields below are presumably used by
// clap as the help text of the positional arguments, so treat them as
// user-facing strings rather than plain documentation.
#[derive(Parser, Debug)]
#[command(about = "Build `.node2type.bin` from `.node2swhid.bin`. Example usage: cargo run --bin node2type -- swh/graph/example_dataset/compressed/example ", long_about = None)]
struct Args {
/// The basename of the graph.
basename: String,
/// The basename of the file to create. By default it's the same as
/// `basename`.
dst_basename: Option<String>,
}
pub fn main() -> Result<()> {
let args = Args::parse();
stderrlog::new()
.verbosity(2)
.timestamp(stderrlog::Timestamp::Second)
.init()
.with_context(|| "While Initializing the stderrlog")?;
// load the node ID -> SWHID map so we can convert it to a node2file
let node2swhid_path = format!("{}.node2swhid.bin", args.basename);
info!("loading node ID -> SWHID map from {node2swhid_path} ...");
let node2swhid = Node2SWHID::load(&node2swhid_path).with_context(|| {
format!(
"While loading the .node2swhid.bin file: {}",
node2swhid_path
)
})?;
let num_nodes = node2swhid.len();
// compute the path of the file we are creating
let node2type_path = format!(
"{}.node2type.bin",
args.dst_basename.unwrap_or(args.basename)
);
// create a new node2type file that can index `num_nodes` nodes
let mut node2type = Node2Type::new(&node2type_path, num_nodes as u64)
.with_context(|| format!("While creating the .node2type.bin file: {}", node2type_path))?;
// init the progress logger
let mut pl = ProgressLogger::default().display_memory();
pl.item_name = "node";
pl.local_speed = true;
pl.expected_updates = Some(num_nodes);
pl.start("iterating over node ID -> SWHID map ...");
// build the file
for node_id in 0..num_nodes {
// get the SWHID of the node to get the type and write it inside node2type
node2type.set(node_id, node2swhid.get(node_id).unwrap().node_type);
pl.light_update();
}
pl.done();
Ok(())
}
......@@ -9,5 +9,8 @@
mod node2swhid;
pub use node2swhid::Node2SWHID;
mod node2type;
pub use node2type::Node2Type;
mod order;
pub use order::Order;
......@@ -34,6 +34,22 @@ impl Node2SWHID {
impl Node2SWHID {
/// Convert a node_id to a SWHID without checking that it is in bounds
///
/// # Safety
/// The caller must ensure that `node_id` is within the bounds of the map:
/// the underlying slice access is not bounds-checked when debug asserts are
/// disabled.
#[inline]
pub unsafe fn get_unchecked(&self, node_id: usize) -> SWHID {
    let start = node_id * SWHID::BYTES_SIZE;
    let end = start + SWHID::BYTES_SIZE;
    // the range is exactly `SWHID::BYTES_SIZE` bytes long, so the conversion
    // to a fixed-size array of that same length cannot fail
    let raw = <[u8; SWHID::BYTES_SIZE]>::try_from(self.data.get_unchecked(start..end)).unwrap();
    // parsing can only fail on a corrupted file, so it's ok to panic
    SWHID::try_from(raw).unwrap()
}
/// Convert a node_it to a SWHID
#[inline]
pub fn get(&self, node_id: usize) -> Option<SWHID> {
let offset = node_id * SWHID::BYTES_SIZE;
let bytes = self.data.get(offset..offset + SWHID::BYTES_SIZE)?;
......@@ -45,6 +61,7 @@ impl Node2SWHID {
/// Number of node ids stored in this map, i.e. how many whole
/// `SWHID::BYTES_SIZE`-byte records fit in the underlying data
#[allow(clippy::len_without_is_empty)] // rationale: we don't care about empty maps
#[inline]
pub fn len(&self) -> usize {
    let total_bytes = self.data.len();
    total_bytes / SWHID::BYTES_SIZE
}
......
use crate::SWHType;
use anyhow::{Context, Result};
use log::info;
use mmap_rs::{Mmap, MmapMut};
use std::path::Path;
use sux::prelude::CompactArray;
use sux::traits::*;
/// Struct to create and load a `.node2type.bin` file and convert node ids to types.
///
/// `B` is the backend holding the packed data: the constructors in this file
/// build it over a memory-mapped file, either writable (`MmapMut`) or
/// read-only (`Mmap`).
pub struct Node2Type<B: VSlice> {
// packed array with one entry of `SWHType::BITWIDTH` bits per node,
// indexed by node id
data: CompactArray<B>,
}
impl<B: VSlice> Node2Type<B> {
    /// Get the type of a node with id `node_id` without bounds checking
    ///
    /// # Safety
    /// This function is unsafe because it does not check that `node_id` is
    /// within bounds of the array if debug asserts are disabled
    ///
    /// # Panics
    /// If the value stored for `node_id` does not encode a valid [`SWHType`],
    /// which can only happen on a corrupted file.
    #[inline]
    pub unsafe fn get_unchecked(&self, node_id: usize) -> SWHType {
        SWHType::try_from(self.data.get_unchecked(node_id) as u8).unwrap()
    }

    /// Get the type of a node with id `node_id`
    ///
    /// Returns `None` when `node_id` is out of bounds or when the stored
    /// value does not encode a valid [`SWHType`].
    #[inline]
    pub fn get(&self, node_id: usize) -> Option<SWHType> {
        // the underlying array panics on out-of-bounds indices: guard here so
        // that this getter honours its `Option` return type
        if node_id >= self.data.len() {
            return None;
        }
        SWHType::try_from(self.data.get(node_id) as u8).ok()
    }
}
impl<B: VSliceMut> Node2Type<B> {
    /// Set the type of a node with id `node_id` without bounds checking
    ///
    /// # Safety
    /// This function is unsafe because it does not check that `node_id` is
    /// within bounds of the array if debug asserts are disabled
    #[inline]
    pub unsafe fn set_unchecked(&mut self, node_id: usize, node_type: SWHType) {
        // the type is stored as its integer discriminant
        let encoded = node_type as u64;
        self.data.set_unchecked(node_id, encoded);
    }

    /// Set the type of a node with id `node_id`
    #[inline]
    pub fn set(&mut self, node_id: usize, node_type: SWHType) {
        // the type is stored as its integer discriminant
        let encoded = node_type as u64;
        self.data.set(node_id, encoded);
    }
}
impl Node2Type<MmapMut> {
/// Create a new `.node2type.bin` file
pub fn new<P: AsRef<Path>>(path: P, num_nodes: u64) -> Result<Self> {
let path = path.as_ref();
// compute the size of the file we are creating in bytes
let mut file_len = (num_nodes * SWHType::BITWIDTH as u64 + 7) / 8;
// make the file dimension a multiple of 8 bytes so CompactArray can
// read u64 words from it
file_len += 8 - (file_len % 8);
info!("The resulting file will be {} bytes long.", file_len);
// create the file
let node2type_file = std::fs::File::options()
.read(true)
.write(true)
.create(true)
.open(path)
.with_context(|| {
format!(
"While creating the .node2type.bin file: {}",
path.to_string_lossy()
)
})?;
// fallocate the file with zeros so we can fill it without ever resizing it
node2type_file
.set_len(file_len)
.with_context(|| "While fallocating the file with zeros")?;
// create a mutable mmap to the file so we can directly write it in place
let mmap = unsafe {
mmap_rs::MmapOptions::new(file_len as _)?
.with_file(node2type_file, 0)
.map_mut()
.with_context(|| "While mmapping the file")?
};
// use the CompactArray over the mmap
let node2type =
unsafe { CompactArray::from_raw_parts(mmap, SWHType::BITWIDTH, num_nodes as usize) };
Ok(Self { data: node2type })
}
/// Load a mutable `.node2type.bin` file
pub fn load_mut<P: AsRef<Path>>(path: P, num_nodes: u64) -> Result<Self> {
let path = path.as_ref();
let file_len = path.metadata()?.len();
let file = std::fs::File::open(path)?;
let data = unsafe {
mmap_rs::MmapOptions::new(file_len as _)?
.with_flags((sux::prelude::Flags::TRANSPARENT_HUGE_PAGES).mmap_flags())
.with_file(file, 0)
.map_mut()?
};
#[cfg(target_os = "linux")]
unsafe {
libc::madvise(data.as_ptr() as *mut _, data.len(), libc::MADV_RANDOM)
};
// use the CompactArray over the mmap
let node2type =
unsafe { CompactArray::from_raw_parts(data, SWHType::BITWIDTH, num_nodes as usize) };
Ok(Self { data: node2type })
}
}
impl Node2Type<Mmap> {
/// Load a read-only `.node2type.bin` file
pub fn load<P: AsRef<Path>>(path: P, num_nodes: u64) -> Result<Self> {
let path = path.as_ref();
let file_len = path.metadata()?.len();
let file = std::fs::File::open(path)?;
let data = unsafe {
mmap_rs::MmapOptions::new(file_len as _)?
.with_flags((sux::prelude::Flags::TRANSPARENT_HUGE_PAGES).mmap_flags())
.with_file(file, 0)
.map()?
};
#[cfg(target_os = "linux")]
unsafe {
libc::madvise(data.as_ptr() as *mut _, data.len(), libc::MADV_RANDOM)
};
// use the CompactArray over the mmap
let node2type =
unsafe { CompactArray::from_raw_parts(data, SWHType::BITWIDTH, num_nodes as usize) };
Ok(Self { data: node2type })
}
}
......@@ -83,6 +83,21 @@ impl TryFrom<u8> for SWHType {
}
impl SWHType {
/// The number of possible node types.
///
/// This is hardcoded for now and has to be updated by hand whenever a type
/// is added. The unstable function `std::mem::variant_count` or the
/// `variant_count` crate could compute it automatically, but how to deal
/// with this is still to be decided.
pub const NUMBER_OF_TYPES: usize = 6;

/// The number of bits needed to store a node type as an integer.
///
/// This is `ceil(log2(NUMBER_OF_TYPES))`, computed as
/// `floor(log2(NUMBER_OF_TYPES))` and rounded up by one when
/// `NUMBER_OF_TYPES` is not a power of two.
pub const BITWIDTH: usize = {
    let floor_log2 = Self::NUMBER_OF_TYPES.ilog2() as usize;
    if Self::NUMBER_OF_TYPES.is_power_of_two() {
        floor_log2
    } else {
        floor_log2 + 1
    }
};
/// Convert a type to the str used in the SWHID
pub fn to_str(&self) -> &'static str {
match self {
......
use anyhow::{Context, Result};
use log::info;
use swh_graph::map::{Node2SWHID, Node2Type};
const BASENAME: &str = "../swh/graph/example_dataset/compressed/example";
/// Checks that the `.node2type.bin` file of the example dataset stores, for
/// every node, the same type as the one found in its SWHID.
#[test]
fn test_node2type() -> Result<()> {
    // reference data: the node ID -> SWHID map
    let node2swhid_path = format!("{}.node2swhid.bin", BASENAME);
    info!("loading node ID -> SWHID map from {node2swhid_path} ...");
    let node2swhid = Node2SWHID::load(&node2swhid_path).with_context(|| {
        format!(
            "While loading the .node2swhid.bin file: {}",
            node2swhid_path
        )
    })?;
    let num_nodes = node2swhid.len();

    // data under test: the node ID -> type map
    let node2type_path = format!("{}.node2type.bin", BASENAME);
    info!("loading node ID -> type map from {node2type_path} ...");
    let node2type = Node2Type::load(&node2type_path, num_nodes as u64)
        .with_context(|| format!("While loading the .node2type.bin file: {}", node2type_path))?;

    // both maps must agree on the type of every node
    for node_id in 0..num_nodes {
        let expected = node2swhid.get(node_id).unwrap().node_type;
        let actual = node2type.get(node_id).unwrap();
        assert_eq!(expected, actual);
    }

    Ok(())
}
......@@ -13,7 +13,6 @@ use webgraph::prelude::*;
const BASENAME: &str = "../swh/graph/example_dataset/compressed/example";
#[test]
#[allow(dead_code)]
fn test_order_mph() -> Result<()> {
// Setup a stderr logger because ProgressLogger uses the `log` crate
// to printout
......
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment