From bc52fcf32f04efa434ac37960811a1981fc1afcb Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 6 Dec 2024 19:10:57 +0000 Subject: [PATCH 01/29] Docs: Extend to StringWa.rs --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index acff49e..6f51822 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,23 @@ -# [`memchr`](https://github.com/BurntSushi/memchr) vs [`stringzilla`](https://github.com/ashvardanian/StringZilla) +# StringWa.rs -## Rust Substring Search Benchmarks +![StringWa.rs Thumbnail](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/StringWa.rs.jpg?raw=true) + +_Not to pick a fight, but let there be String Wars!_ 😅 +Jokes aside, many __great__ libraries for string processing exist. +_Mostly, of course, written in C and C++, but some in Rust as well._ 😅 + +Where Rust decimates C and C++, however, is dependency management, making it perfect for comparing different systems-level projects to each other! +So, to accelerate the development of the [`stringzilla`](https://github.com/ashvardanian/StringZilla) C library, I've created this repository to compare it against: + +- [`memchr`](https://github.com/BurntSushi/memchr) for substring search. +- [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances. +- [`aHash`](https://github.com/tkaitchuck/aHash) for hashing. +- [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search. + +Of course, the functionality of the projects is different, as are the APIs and the usage patterns. +So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations. + +## Substring Search Benchmarks Substring search is one of the most common operations in text processing, and one of the slowest. 
StringZilla was designed to supersede LibC and implement those core operations in CPU-friendly manner, using branchless operations, SWAR, and SIMD assembly instructions. @@ -31,7 +48,7 @@ Before running benchmarks, you can test your Rust environment running: ```bash cargo install cargo-criterion --locked -HAYSTACK_PATH=README.md cargo criterion --jobs 8 +HAYSTACK_PATH=README.md cargo criterion bench_find --jobs 8 ``` On Windows using PowerShell you'd need to set the environment variable differently: @@ -47,6 +64,8 @@ All inclusions of that token in the haystack are counted, and the throughput is This generally results in very stable and predictable results. The benchmark also includes a warm-up, to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. +## Datasets + ### ASCII Corpus For benchmarks on ASCII data I've used the English Leipzig Corpora Collection. From c842fe918a26ea4d7c72cac1ffa775a6d867649f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 6 Dec 2024 19:24:13 +0000 Subject: [PATCH 02/29] Add: Levenshtein benchmarks --- .vscode/settings.json | 9 ++++ Cargo.lock | 27 +++++++----- Cargo.toml | 28 ++++++++++-- README.md | 43 ++++++++++++++----- bench.rs => bench_find.rs | 17 ++++---- bench_levenshtein.rs | 89 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 180 insertions(+), 33 deletions(-) create mode 100644 .vscode/settings.json rename bench.rs => bench_find.rs (92%) create mode 100644 bench_levenshtein.rs diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b810795 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "cSpell.words": [ + "stringwars", + "memchr", + "memmem", + "rfind", + "stringzilla" + ] +} diff --git a/Cargo.lock b/Cargo.lock index 5b5ae31..edb5018 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,16 +258,6 @@ version = "2.7.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" -[[package]] -name = "memchr_vs_stringzilla" -version = "0.1.0" -dependencies = [ - "criterion", - "memchr", - "rand", - "stringzilla", -] - [[package]] name = "num-traits" version = "0.2.18" @@ -371,6 +361,12 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rapidfuzz" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270e04e5ea61d40841942bb15e451c29ee1618637bcf97fc7ede5dd4a9b1601b" + [[package]] name = "rayon" version = "1.8.1" @@ -466,6 +462,17 @@ dependencies = [ "serde", ] +[[package]] +name = "stringwars" +version = "0.1.0" +dependencies = [ + "criterion", + "memchr", + "rand", + "rapidfuzz", + "stringzilla", +] + [[package]] name = "stringzilla" version = "3.3.0" diff --git a/Cargo.toml b/Cargo.toml index e37ce43..7ddbbe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,35 @@ [package] -name = "memchr_vs_stringzilla" +name = "stringwars" version = "0.1.0" edition = "2018" [dependencies] rand = "0.8.5" criterion = "0.5.1" -memchr = { version = "2.7.1", default-features = false } stringzilla = { version = "3.3.0" } +# Feature-based dependencies for benchmarks +[features] +bench_find = ["memchr"] +bench_levenshtein = ["rapidfuzz"] + +[dependencies.memchr] +version = "2.7.1" +default-features = false +optional = true + +[dependencies.rapidfuzz] +version = "0.5.0" +optional = true + +[[bench]] +name = "bench_find" +path = "bench_find.rs" +harness = false +required-features = ["bench_find"] + [[bench]] -name = "bench" +name = "bench_levenshtein" +path = "bench_levenshtein.rs" harness = false -path = "bench.rs" +required-features = ["bench_levenshtein"] diff --git a/README.md b/README.md index 6f51822..fe2d87d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ _Not to pick a fight, but let there be String Wars!_ 😅 Jokes aside, many __great__ libraries for string 
processing exist. _Mostly, of course, written in C and C++, but some in Rust as well._ 😅 -Where Rust decimates C and C++, however, is dependency management, making it perfect for comparing different systems-level projects to each other! +Where Rust decimates C and C++, however, is the __simplicity__ of dependency management, making it great for benchmarking low-level software! So, to accelerate the development of the [`stringzilla`](https://github.com/ashvardanian/StringZilla) C library, I've created this repository to compare it against: - [`memchr`](https://github.com/BurntSushi/memchr) for substring search. @@ -48,22 +48,43 @@ Before running benchmarks, you can test your Rust environment running: ```bash cargo install cargo-criterion --locked -HAYSTACK_PATH=README.md cargo criterion bench_find --jobs 8 ``` +Each benchmark includes a warm-up, to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. +To run them on Linux and MacOS, pass the dataset path as an environment variable: + +- Substring Search: + + ```bash + STRINGWARS_DATASET=README.md cargo criterion --features bench_find bench_find --jobs 8 + ``` + + As part of the benchmark, the input "haystack" file is whitespace-tokenized into an array of strings. + In every benchmark iteration, a new "needle" is taken from that array of tokens. + All inclusions of that token in the haystack are counted, and the throughput is calculated. 
+ +- Edit Distance: + + ```bash + STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + ``` + +- Hashing: + + ```bash + STRINGWARS_MODE=file STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + ``` + On Windows using PowerShell you'd need to set the environment variable differently: ```powershell -$env:HAYSTACK_PATH="README.md" +$env:STRINGWARS_DATASET="README.md" cargo criterion --jobs 8 ``` -As part of the benchmark, the input "haystack" file is whitespace-tokenized into an array of strings. -In every benchmark iteration, a new "needle" is taken from that array of tokens. -All inclusions of that token in the haystack are counted, and the throughput is calculated. -This generally results in very stable and predictable results. -The benchmark also includes a warm-up, to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. 
- ## Datasets ### ASCII Corpus @@ -73,7 +94,7 @@ It's 124 MB in size, 1'000'000 lines long, and contains 8'388'608 tokens of mean ```bash wget --no-clobber -O leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt -HAYSTACK_PATH=leipzig1M.txt cargo criterion --jobs 8 +STRINGWARS_DATASET=leipzig1M.txt cargo criterion --jobs 8 ``` ### UTF8 Corpus @@ -85,5 +106,5 @@ To download, unpack, and run the benchmarks, execute the following bash script i ```bash wget --no-clobber -O xlsum.csv.gz https://github.com/ashvardanian/xl-sum/releases/download/v1.0.0/xlsum.csv.gz gzip -d xlsum.csv.gz -HAYSTACK_PATH=xlsum.csv cargo criterion --jobs 8 +STRINGWARS_DATASET=xlsum.csv cargo criterion --jobs 8 ``` diff --git a/bench.rs b/bench_find.rs similarity index 92% rename from bench.rs rename to bench_find.rs index 53ec131..3de9c5c 100644 --- a/bench.rs +++ b/bench_find.rs @@ -1,7 +1,8 @@ -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use std::env; use std::fs; +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; + use memchr::memmem; use stringzilla::StringZilla; @@ -12,11 +13,11 @@ fn configure_bench() -> Criterion { .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. } -fn benchmarks(c: &mut Criterion) { +fn bench_find(c: &mut Criterion) { // Get the haystack path from the environment variable. - let haystack_path = - env::var("HAYSTACK_PATH").expect("HAYSTACK_PATH environment variable not set"); - let haystack_content = fs::read_to_string(&haystack_path).expect("Could not read haystack"); + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack"); // Tokenize the haystack content by white space. let needles: Vec<&str> = haystack_content.split_whitespace().collect(); @@ -117,8 +118,8 @@ fn perform_reverse_benchmarks( } criterion_group! 
{ - name = sz_bench; + name = bench_find_group; config = configure_bench(); - targets = benchmarks + targets = bench_find } -criterion_main!(sz_bench); +criterion_main!(bench_find_group); diff --git a/bench_levenshtein.rs b/bench_levenshtein.rs new file mode 100644 index 0000000..038d63b --- /dev/null +++ b/bench_levenshtein.rs @@ -0,0 +1,89 @@ +use std::env; +use std::fs; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; + +use rapidfuzz::distance::levenshtein; +use stringzilla::StringZilla; + +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(1000) // Number of iterations for each benchmark. + .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. +} + +fn bench_levenshtein(c: &mut Criterion) { + // Get the dataset path from the environment variable. + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); + + // Depending on the mode, split the input differently. + let units: Vec<&str> = match mode.as_str() { + "words" => content.split_whitespace().collect(), + "lines" => content.lines().collect(), + other => panic!( + "Unknown STRINGWARS_MODE: {}. Use 'lines' or 'words'.", + other + ), + }; + + if units.len() < 2 { + panic!("Dataset must contain at least two items for comparisons."); + } + + // Pair up the units in twos. 
+ let pairs: Vec<(&str, &str)> = units + .chunks(2) + .filter_map(|chunk| { + if chunk.len() == 2 { + Some((chunk[0], chunk[1])) + } else { + None + } + }) + .collect(); + + let data_size = pairs.len(); + + let mut g = c.benchmark_group("levenshtein"); + g.throughput(Throughput::Elements(data_size as u64)); + + perform_levenshtein_benchmarks(&mut g, &pairs); + + g.finish(); +} + +fn perform_levenshtein_benchmarks( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + pairs: &[(&str, &str)], +) { + // Benchmark for StringZilla Levenshtein distance + let mut pair_index: usize = 0; + g.bench_function("stringzilla::levenshtein", |b| { + b.iter(|| { + let (a, b) = pairs[pair_index]; + let _distance = a.sz_edit_distance(b); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + + // Benchmark for RapidFuzz Levenshtein distance + let mut pair_index: usize = 0; + g.bench_function("rapidfuzz::levenshtein", |b| { + b.iter(|| { + let (a, b) = pairs[pair_index]; + let _distance = levenshtein::distance(a.chars(), b.chars()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); +} + +criterion_group! 
{ + name = bench_levenshtein_group; + config = configure_bench(); + targets = bench_levenshtein +} +criterion_main!(bench_levenshtein_group); From b906b91065a8c4feca6c4f1c7b4b7f020d28ef9b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 6 Dec 2024 21:10:39 +0000 Subject: [PATCH 03/29] Add: Hashing benchmarks --- .vscode/settings.json | 3 +- Cargo.lock | 51 ++++++++++++- Cargo.toml | 19 ++++- README.md | 8 +- bench_hash.rs | 110 ++++++++++++++++++++++++++ bench_levenshtein.rs | 174 ++++++++++++++++++++++++++++++++++-------- 6 files changed, 327 insertions(+), 38 deletions(-) create mode 100644 bench_hash.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index b810795..6cd23c3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,9 +1,10 @@ { "cSpell.words": [ - "stringwars", "memchr", "memmem", + "rapidfuzz", "rfind", + "stringwars", "stringzilla" ] } diff --git a/Cargo.lock b/Cargo.lock index edb5018..54017f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.2" @@ -466,18 +479,18 @@ dependencies = [ name = "stringwars" version = "0.1.0" dependencies = [ + "ahash", "criterion", "memchr", "rand", "rapidfuzz", "stringzilla", + "xxhash-rust", ] [[package]] name = "stringzilla" -version = "3.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a7521ac3427b9007b364cb7b412d5d2634c6a1108d95b73dd55d7341379df2" +version = "3.11.0" dependencies = [ "cc", ] @@ -509,6 +522,12 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.4.0" @@ -685,3 +704,29 @@ name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "xxhash-rust" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 7ddbbe8..00f23ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,12 +6,14 @@ edition = "2018" [dependencies] rand = "0.8.5" criterion = "0.5.1" -stringzilla = { version = "3.3.0" } +# stringzilla = { version = "3.3.0" } +stringzilla = { path = "../StringZilla-dev" } # Feature-based dependencies for benchmarks [features] bench_find = ["memchr"] bench_levenshtein = ["rapidfuzz"] +bench_hash = ["ahash", "xxhash-rust"] [dependencies.memchr] version = "2.7.1" @@ -22,6 +24,15 @@ optional = true version = "0.5.0" optional = true +[dependencies.ahash] +version = "0.8" +optional = true + +[dependencies.xxhash-rust] +version = "0.8" +optional = true +features = ["xxh3", "const_xxh3"] + [[bench]] name = "bench_find" path = "bench_find.rs" @@ -33,3 +44,9 @@ name = "bench_levenshtein" path = "bench_levenshtein.rs" harness = false required-features = ["bench_levenshtein"] + +[[bench]] +name = "bench_hash" +path = "bench_hash.rs" +harness = false +required-features = ["bench_hash"] diff --git a/README.md b/README.md index fe2d87d..6671f51 100644 --- a/README.md +++ b/README.md @@ -66,10 +66,14 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable - Edit Distance: ```bash - STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 - STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + STRINGWARS_MODE=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + STRINGWARS_MODE=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 ``` + Edit distance benchmarks compute the Levenshtein distance 
between consecutive pairs of whitespace-delimited words or newline-delimited lines. + They include byte-level and character-level operations and also run for the bounded case - when the maximum allowed distance is predefined. + By default, the maximum allowed distance is set to 15% of the longer string in each pair. + - Hashing: ```bash diff --git a/bench_hash.rs b/bench_hash.rs new file mode 100644 index 0000000..2f95de7 --- /dev/null +++ b/bench_hash.rs @@ -0,0 +1,110 @@ +use std::env; +use std::fs; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use std::hash::{BuildHasher, Hasher}; + +use stringzilla::sz::{checksum as sz_checksum, hash as sz_hash}; +use stringzilla::StringZilla; + +use ahash::AHasher; +use xxhash_rust::const_xxh3::xxh3_64 as const_xxh3; +use xxhash_rust::xxh3::xxh3_64; + +// Mode: "lines", "words", "file" +// STRINGWARS_MODE controls how we interpret the input data. +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(1000) // Number of iterations per benchmark. + .warm_up_time(std::time::Duration::from_secs(10)) // Let CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. +} + +fn bench_hash(c: &mut Criterion) { + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + + let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); + let units: Vec<&str> = match mode.as_str() { + "lines" => content.lines().collect(), + "words" => content.split_whitespace().collect(), + "file" => { + // In "file" mode, treat the entire content as a single unit. + vec![&content] + } + other => panic!( + "Unknown STRINGWARS_MODE: {}. 
Use 'lines', 'words', or 'file'.", + other + ), + }; + + if units.is_empty() { + panic!("No data found for hashing in the provided dataset."); + } + + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = units.iter().map(|u| u.len()).sum(); + + let mut g = c.benchmark_group("hash"); + g.throughput(Throughput::Bytes(total_bytes as u64)); + + perform_hashing_benchmarks(&mut g, &units); + + g.finish(); +} + +fn perform_hashing_benchmarks( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + units: &[&str], +) { + // Benchmark StringZilla checksums + let mut index = 0; + g.bench_function("stringzilla::checksum", |b| { + b.iter(|| { + let unit = units[index]; + let _hash = sz_checksum(unit.as_bytes()); + index = (index + 1) % units.len(); + }) + }); + + // Benchmark StringZilla hashing + let mut index = 0; + g.bench_function("stringzilla::hash", |b| { + b.iter(|| { + let unit = units[index]; + let _hash = sz_hash(unit.as_bytes()); + index = (index + 1) % units.len(); + }) + }); + + // Benchmark aHash + let mut index = 0; + let ahash_builder = ahash::RandomState::new(); + g.bench_function("aHash", |b| { + b.iter(|| { + let unit = units[index]; + let mut hasher = ahash_builder.build_hasher(); + hasher.write(unit.as_bytes()); + let _hash = hasher.finish(); + index = (index + 1) % units.len(); + }) + }); + + // Benchmark xxHash (xxh3) + let mut index = 0; + g.bench_function("xxh3", |b| { + b.iter(|| { + let unit = units[index]; + let _hash = xxh3_64(unit.as_bytes()); + index = (index + 1) % units.len(); + }) + }); +} + +criterion_group! 
{ + name = bench_hash_group; + config = configure_bench(); + targets = bench_hash +} +criterion_main!(bench_hash_group); diff --git a/bench_levenshtein.rs b/bench_levenshtein.rs index 038d63b..f997858 100644 --- a/bench_levenshtein.rs +++ b/bench_levenshtein.rs @@ -1,26 +1,33 @@ use std::env; use std::fs; -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; - +use criterion::{criterion_group, criterion_main, Criterion}; use rapidfuzz::distance::levenshtein; use stringzilla::StringZilla; fn configure_bench() -> Criterion { Criterion::default() - .sample_size(1000) // Number of iterations for each benchmark. - .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. - .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. + .sample_size(1000) + .warm_up_time(std::time::Duration::from_secs(10)) + .measurement_time(std::time::Duration::from_secs(120)) } fn bench_levenshtein(c: &mut Criterion) { - // Get the dataset path from the environment variable. let dataset_path = env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); - // Depending on the mode, split the input differently. + let bound_percent = env::var("STRINGWARS_ERROR_BOUND") + .unwrap_or_else(|_| "15".to_string()) + .parse::() + .expect("STRINGWARS_ERROR_BOUND must be a number"); + + let max_pairs = env::var("STRINGWARS_MAX_PAIRS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(100); + let units: Vec<&str> = match mode.as_str() { "words" => content.split_whitespace().collect(), "lines" => content.lines().collect(), @@ -34,8 +41,7 @@ fn bench_levenshtein(c: &mut Criterion) { panic!("Dataset must contain at least two items for comparisons."); } - // Pair up the units in twos. 
- let pairs: Vec<(&str, &str)> = units + let mut pairs: Vec<(&str, &str)> = units .chunks(2) .filter_map(|chunk| { if chunk.len() == 2 { @@ -46,12 +52,25 @@ fn bench_levenshtein(c: &mut Criterion) { }) .collect(); - let data_size = pairs.len(); + if pairs.is_empty() { + panic!("No pairs could be formed from the dataset."); + } + + if pairs.len() > max_pairs { + pairs.truncate(max_pairs); + } + + let pair_bounds: Vec = pairs + .iter() + .map(|(a, b)| { + let max_len = a.len().max(b.len()); + ((max_len as u64 * bound_percent) / 100) as usize + }) + .collect(); let mut g = c.benchmark_group("levenshtein"); - g.throughput(Throughput::Elements(data_size as u64)); - perform_levenshtein_benchmarks(&mut g, &pairs); + perform_levenshtein_benchmarks(&mut g, &pairs, &pair_bounds); g.finish(); } @@ -59,26 +78,119 @@ fn bench_levenshtein(c: &mut Criterion) { fn perform_levenshtein_benchmarks( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, pairs: &[(&str, &str)], + pair_bounds: &[usize], ) { - // Benchmark for StringZilla Levenshtein distance - let mut pair_index: usize = 0; - g.bench_function("stringzilla::levenshtein", |b| { - b.iter(|| { - let (a, b) = pairs[pair_index]; - let _distance = a.sz_edit_distance(b); - pair_index = (pair_index + 1) % pairs.len(); - }) - }); - - // Benchmark for RapidFuzz Levenshtein distance - let mut pair_index: usize = 0; - g.bench_function("rapidfuzz::levenshtein", |b| { - b.iter(|| { - let (a, b) = pairs[pair_index]; - let _distance = levenshtein::distance(a.chars(), b.chars()); - pair_index = (pair_index + 1) % pairs.len(); - }) - }); + // StringZilla, bytes-based, unbounded + { + let mut pair_index = 0; + g.bench_function("stringzilla::levenshtein_bytes_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = a.sz_edit_distance(b_str.as_bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, bytes-based, bounded + { + let mut pair_index = 0; + 
g.bench_function("stringzilla::levenshtein_bytes_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = a + .as_bytes() + .sz_edit_distance_bounded(b_str.as_bytes(), bound); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, UTF-8, unbounded + { + let mut pair_index = 0; + g.bench_function("stringzilla::levenshtein_utf8_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = a.as_bytes().sz_edit_distance_utf8(b_str.as_bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, UTF-8, bounded + { + let mut pair_index = 0; + g.bench_function("stringzilla::levenshtein_utf8_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = a + .as_bytes() + .sz_edit_distance_utf8_bounded(b_str.as_bytes(), bound); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, ASCII (bytes) unbounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_bytes_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = levenshtein::distance(a.bytes(), b_str.bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, ASCII (bytes) bounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_bytes_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = levenshtein::distance_with_args( + a.bytes(), + b_str.bytes(), + &levenshtein::Args::default().score_cutoff(bound), + ); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, UTF-8 (chars) unbounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_utf8_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = levenshtein::distance(a.chars(), 
b_str.chars()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, UTF-8 (chars) bounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_utf8_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = levenshtein::distance_with_args( + a.chars(), + b_str.chars(), + &levenshtein::Args::default().score_cutoff(bound), + ); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } } criterion_group! { From bd23a21d960a344f512e7cb0d902647951a84ff1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 20:22:19 +0000 Subject: [PATCH 04/29] Add: Placeholder for TF-IDF https://github.com/ashvardanian/SimSIMD/pull/239 --- .vscode/settings.json | 3 ++- README.md | 10 +++++++++ bench_tfidf.rs | 51 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 bench_tfidf.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index 6cd23c3..acb1fc9 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,7 @@ "rapidfuzz", "rfind", "stringwars", - "stringzilla" + "stringzilla", + "tfidf" ] } diff --git a/README.md b/README.md index 6671f51..c259f79 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv - [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances. - [`aHash`](https://github.com/tkaitchuck/aHash) for hashing. - [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search. +- [`tantivy`](https://github.com/quickwit-oss/tantivy) for document retrieval. Of course, the functionality of the projects is different, as are the APIs and the usage patterns. So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations. 
@@ -82,6 +83,15 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 ``` +- Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf): + + ```bash + STRINGWARS_DATASET=README.md cargo criterion --features bench_tfidf bench_tfidf --jobs 8 + ``` + + The TF-IDF benchmarks compute the term frequency-inverse document frequency for each word in the input file. + The benchmark relies on a hybrid of StringZilla and SimSIMD to achieve the best performance. + On Windows using PowerShell you'd need to set the environment variable differently: ```powershell diff --git a/bench_tfidf.rs b/bench_tfidf.rs new file mode 100644 index 0000000..bef3368 --- /dev/null +++ b/bench_tfidf.rs @@ -0,0 +1,51 @@ +use std::env; +use std::fs; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; + +use memchr::memmem; +use stringzilla::StringZilla; + +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(1000) // Test this many needles. + .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. +} + +fn bench_tfidf(c: &mut Criterion) { + // Get the haystack path from the environment variable. + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack"); + + // Tokenize the haystack content by white space. 
+ let needles: Vec<&str> = haystack_content.split_whitespace().collect(); + if needles.is_empty() { + panic!("No tokens found in the haystack."); + } + + let haystack = haystack_content.as_bytes(); + let haystack_length = haystack.len(); + + // Benchmarks for forward search + let mut g = c.benchmark_group("search-forward"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_forward_benchmarks(&mut g, &needles, haystack); + g.finish(); + + // Benchmarks for reverse search + let mut g = c.benchmark_group("search-reverse"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_reverse_benchmarks(&mut g, &needles, haystack); + g.finish(); +} + +... + +criterion_group! { + name = bench_tfidf_group; + config = configure_bench(); + targets = bench_tfidf +} +criterion_main!(bench_tfidf_group); From 7a0fc0bea5f9e2ad9baacb1b8c4a98b9d732d3f8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:41:48 +0000 Subject: [PATCH 05/29] Improve: More hashing backends --- .vscode/settings.json | 4 +- Cargo.lock | 65 +++++++++++++++- Cargo.toml | 10 ++- bench_hash.rs | 172 ++++++++++++++++++++++++++++++------------ 4 files changed, 196 insertions(+), 55 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index acb1fc9..b57ebfd 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { "cSpell.words": [ + "ahash", + "bytesum", "memchr", "memmem", "rapidfuzz", @@ -8,4 +10,4 @@ "stringzilla", "tfidf" ] -} +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 54017f9..de66ffb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "ahash" @@ -36,12 +36,37 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "blake3" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "675f87afced0413c9bb02843499dbbd3882a237645883f71a2b59644a6d2f753" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "bumpalo" version = "3.15.3" @@ -56,9 +81,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.85" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b918671670962b48bc23753aef0c51d072dca6f52f01f800854ada6ddb7f7d3" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -118,6 +146,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] 
name = "criterion" version = "0.5.1" @@ -202,6 +236,15 @@ dependencies = [ "wasi", ] +[[package]] +name = "gxhash" +version = "3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" +dependencies = [ + "rustversion", +] + [[package]] name = "half" version = "2.3.1" @@ -429,6 +472,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustversion" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" + [[package]] name = "ryu" version = "1.0.17" @@ -475,12 +524,20 @@ dependencies = [ "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "stringwars" version = "0.1.0" dependencies = [ "ahash", + "blake3", "criterion", + "gxhash", "memchr", "rand", "rapidfuzz", @@ -490,7 +547,7 @@ dependencies = [ [[package]] name = "stringzilla" -version = "3.11.0" +version = "3.11.3" dependencies = [ "cc", ] diff --git a/Cargo.toml b/Cargo.toml index 00f23ea..ff02ba5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ stringzilla = { path = "../StringZilla-dev" } [features] bench_find = ["memchr"] bench_levenshtein = ["rapidfuzz"] -bench_hash = ["ahash", "xxhash-rust"] +bench_hash = ["ahash", "xxhash-rust", "blake3", "gxhash"] [dependencies.memchr] version = "2.7.1" @@ -24,6 +24,14 @@ optional = true version = "0.5.0" optional = true +[dependencies.blake3] +version = "1.6.1" +optional = true + +[dependencies.gxhash] +version = "3.4.1" +optional = true + [dependencies.ahash] version = "0.8" optional = true diff --git a/bench_hash.rs b/bench_hash.rs index 2f95de7..2860903 
100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -1,23 +1,63 @@ +//! # StringWa.rs Hashing Benchmarks +//! +//! This file contains benchmarks for various Rust hashing libraries using Criterion. +//! +//! The benchmarks compare the performance of different hash functions including: +//! +//! - StringZilla (`bytesum`, `hash`, and incremental `hash` variants) +//! - aHash (both incremental and single-entry variants) +//! - gxhash (gxhash64) +//! - Blake3 (default cryptographic hash) +//! - xxHash (xxh3) through the third-party `xxhash-rust` crate +//! +//! ## Environment Variables +//! +//! The benchmarks use two environment variables to control the input dataset and mode: +//! +//! - `STRINGWARS_DATASET`: Path to the input dataset file. +//! - `STRINGWARS_MODE`: Specifies how to interpret the input. Allowed values: +//! - `lines`: Process the dataset line by line. +//! - `words`: Process the dataset word by word. +//! - `file`: Process the entire file as a single unit. +//! +//! You should also set the `RUSTFLAGS` environment variable to enable the appropriate CPU features. +//! +//! ## Usage Examples +//! +//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: +//! +//! ```sh +//! STRINGWARS_MODE=file STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 +//! STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 +//! STRINGWARS_MODE=words STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 +//! ``` +//! +//! ## Notes +//! +//! - Ensure your CPU supports the required AES and SSE2 instructions when using `gxhash`. +//! - The benchmarks aggregate hashing over the dataset for more realistic throughput measurements. 
use std::env; use std::fs; -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; -use std::hash::{BuildHasher, Hasher}; - -use stringzilla::sz::{checksum as sz_checksum, hash as sz_hash}; -use stringzilla::StringZilla; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; -use ahash::AHasher; -use xxhash_rust::const_xxh3::xxh3_64 as const_xxh3; +use ahash::RandomState; +use blake3; +use gxhash; +use std::hash::{BuildHasher, Hasher}; +use stringzilla::sz::{ + bytesum as sz_bytesum, // + capabilities as sz_capabilities, + dynamic_dispatch as sz_dynamic_dispatch, + hash as sz_hash, + version as sz_version, +}; use xxhash_rust::xxh3::xxh3_64; -// Mode: "lines", "words", "file" -// STRINGWARS_MODE controls how we interpret the input data. fn configure_bench() -> Criterion { Criterion::default() - .sample_size(1000) // Number of iterations per benchmark. - .warm_up_time(std::time::Duration::from_secs(10)) // Let CPU frequencies settle. - .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. + .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(10)) // Actual measurement time. } fn bench_hash(c: &mut Criterion) { @@ -43,68 +83,102 @@ fn bench_hash(c: &mut Criterion) { panic!("No data found for hashing in the provided dataset."); } - // Calculate total bytes processed for throughput reporting + // Calculate total bytes processed for throughput reporting. 
let total_bytes: usize = units.iter().map(|u| u.len()).sum(); - let mut g = c.benchmark_group("hash"); g.throughput(Throughput::Bytes(total_bytes as u64)); - perform_hashing_benchmarks(&mut g, &units); - g.finish(); } fn perform_hashing_benchmarks( - g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, units: &[&str], ) { - // Benchmark StringZilla checksums - let mut index = 0; - g.bench_function("stringzilla::checksum", |b| { + // Benchmark: StringZilla bytesum + group.bench_function("stringzilla::bytesum", |b| { + b.iter(|| { + for unit in units { + // Using black_box to prevent compiler optimizations. + let _hash = sz_bytesum(black_box(unit.as_bytes())); + } + }) + }); + + // Benchmark: StringZilla hash + group.bench_function("stringzilla::hash", |b| { + b.iter(|| { + for unit in units { + let _hash = sz_hash(black_box(unit.as_bytes())); + } + }) + }); + + // Benchmark: std::hash::BuildHasher (SipHash) + group.bench_function("std::hash::BuildHasher (SipHash)", |b| { + let std_builder = std::collections::hash_map::RandomState::new(); + b.iter(|| { + for unit in units { + let mut hasher = std_builder.build_hasher(); + hasher.write(unit.as_bytes()); + let _hash = black_box(hasher.finish()); + } + }) + }); + + // Benchmark: aHash (hash_one) + group.bench_function("aHash (hash_one)", |b| { + let hash_builder = RandomState::with_seed(42); b.iter(|| { - let unit = units[index]; - let _hash = sz_checksum(unit.as_bytes()); - index = (index + 1) % units.len(); + for unit in units { + let _hash = black_box(hash_builder.hash_one(unit.as_bytes())); + } }) }); - // Benchmark StringZilla hashing - let mut index = 0; - g.bench_function("stringzilla::hash", |b| { + // Benchmark: xxHash (xxh3) + group.bench_function("xxh3", |b| { b.iter(|| { - let unit = units[index]; - let _hash = sz_hash(unit.as_bytes()); - index = (index + 1) % units.len(); + for unit in units { + let _hash = 
black_box(xxh3_64(unit.as_bytes())); + } }) }); - // Benchmark aHash - let mut index = 0; - let ahash_builder = ahash::RandomState::new(); - g.bench_function("aHash", |b| { + // Benchmark: Blake3 + group.bench_function("blake3", |b| { b.iter(|| { - let unit = units[index]; - let mut hasher = ahash_builder.build_hasher(); - hasher.write(unit.as_bytes()); - let _hash = hasher.finish(); - index = (index + 1) % units.len(); + for unit in units { + let _hash = black_box(blake3::hash(unit.as_bytes())); + } }) }); - // Benchmark xxHash (xxh3) - let mut index = 0; - g.bench_function("xxh3", |b| { + // Benchmark: gxhash + group.bench_function("gxhash", |b| { b.iter(|| { - let unit = units[index]; - let _hash = xxh3_64(unit.as_bytes()); - index = (index + 1) % units.len(); + for unit in units { + let _hash = black_box(gxhash::gxhash64(unit.as_bytes(), 42)); + } }) }); } -criterion_group! { - name = bench_hash_group; - config = configure_bench(); - targets = bench_hash +fn main() { + // Log the library version info before running benchmarks. + let sz_v = sz_version(); + println!( + "StringZilla version: {}.{}.{}", + sz_v.major, sz_v.minor, sz_v.patch + ); + println!( + "StringZilla uses dynamic dispatch: {}", + sz_dynamic_dispatch() + ); + println!("StringZilla capabilities: {}", sz_capabilities().as_str()); + + // Create a Criterion instance using any desired configuration. + let mut criterion = Criterion::default().configure_from_args(); + bench_hash(&mut criterion); + criterion.final_summary(); } -criterion_main!(bench_hash_group); From 39dbeb05242f954170ca6f1f878788416bb8f9f9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:41:26 +0000 Subject: [PATCH 06/29] Docs: File-level documentation --- bench_find.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/bench_find.rs b/bench_find.rs index 3de9c5c..6756303 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -1,3 +1,25 @@ +//! 
# StringWa.rs Search Benchmarks +//! +//! This file benchmarks the forward and reverse search functionality provided by +//! the StringZilla library and the memchr crate. The benchmarks read an input file +//! (specified by the `STRINGWARS_DATASET` environment variable), tokenize its contents +//! by whitespace into search needles, and then run forward and reverse search benchmarks. +//! +//! ## Usage +//! +//! Set the environment variable `STRINGWARS_DATASET` to the path of your input file. +//! Then run the benchmarks with: +//! +//! ```sh +//! STRINGWARS_DATASET= cargo bench --features bench_search +//! ``` +//! +//! ## Library Metadata +//! +//! Before running the benchmarks, this binary logs the StringZilla metadata (version, +//! dynamic dispatch status, and capabilities) so that you can verify that the library +//! is configured correctly for your CPU. +//! use std::env; use std::fs; From fd5cb5fcb8071d57278e46c26c8c5d5403e7fa99 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:42:42 +0000 Subject: [PATCH 07/29] Add: StringZilla version logging --- bench_find.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 6756303..066a9ee 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -26,7 +26,24 @@ use std::fs; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use memchr::memmem; -use stringzilla::StringZilla; +use stringzilla::sz::{ + capabilities as sz_capabilities, // + dynamic_dispatch as sz_dynamic_dispatch, + version as sz_version, +}; + +fn log_stringzilla_metadata() { + let sz_v = sz_version(); + println!( + "StringZilla version: {}.{}.{}", + sz_v.major, sz_v.minor, sz_v.patch + ); + println!( + "StringZilla uses dynamic dispatch: {}", + sz_dynamic_dispatch() + ); + println!("StringZilla capabilities: {}", sz_capabilities().as_str()); +} fn configure_bench() -> Criterion { 
Criterion::default() @@ -139,9 +156,9 @@ fn perform_reverse_benchmarks( }); } -criterion_group! { - name = bench_find_group; - config = configure_bench(); - targets = bench_find +fn main() { + log_stringzilla_metadata(); + let mut criterion = configure_bench(); + bench_find(&mut criterion); + criterion.final_summary(); } -criterion_main!(bench_find_group); From 11c6f1a2a18aa6d526c57e9a3c488ca9af62fb3e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:43:14 +0000 Subject: [PATCH 08/29] Improve: Cycling through data over indexing --- bench_find.rs | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 066a9ee..35e6dfe 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -85,31 +85,29 @@ fn perform_forward_benchmarks( needles: &[&str], haystack: &[u8], ) { - // Benchmark for StringZilla forward search - let mut token_index: usize = 0; + // Benchmark for StringZilla forward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); g.bench_function("stringzilla::find", |b| { b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; - while let Some(found) = (&haystack[pos..]).sz_find(token_bytes) { + while let Some(found) = sz::find(&haystack[pos..], token_bytes) { pos += found + token_bytes.len(); } - token_index = (token_index + 1) % needles.len(); }) }); - // Benchmark for memchr (forward search) - let mut token_index: usize = 0; // Reset token index for the next benchmark + // Benchmark for memmem forward search using a cycle iterator. 
+ let mut tokens = needles.iter().cycle(); g.bench_function("memmem::find", |b| { b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; while let Some(found) = memmem::find(&haystack[pos..], token_bytes) { pos += found + token_bytes.len(); } - token_index = (token_index + 1) % needles.len(); }) }); } @@ -119,39 +117,37 @@ fn perform_reverse_benchmarks( needles: &[&str], haystack: &[u8], ) { - // Benchmark for StringZilla reverse search - let mut token_index: usize = 0; + // Benchmark for StringZilla reverse search using a cycle iterator. + let mut tokens = needles.iter().cycle(); g.bench_function("stringzilla::rfind", |b| { b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { - if let Some(found) = (&haystack[..end]).sz_rfind(token_bytes) { - pos = Some(found); // Update position to the start of the found token for the next search. + if let Some(found) = sz::rfind(&haystack[..end], token_bytes) { + pos = Some(found); } else { - break; // No more occurrences found. + break; } } - token_index = (token_index + 1) % needles.len(); }) }); - // Benchmark for memchr reverse search - let mut token_index: usize = 0; + // Benchmark for memmem reverse search using a cycle iterator. 
+ let mut tokens = needles.iter().cycle(); g.bench_function("memmem::rfind", |b| { b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { if let Some(found) = memmem::rfind(&haystack[..end], token_bytes) { - pos = Some(found); // Update position to the start of the found token for the next search. + pos = Some(found); } else { - break; // No more occurrences found. + break; } } - token_index = (token_index + 1) % needles.len(); }) }); } From 7fba12f16b3332bc19c089bc92a691e046350a64 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:43:33 +0000 Subject: [PATCH 09/29] Add: Sorting drafts --- bench_find.rs | 8 ++-- bench_sort.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 bench_sort.rs diff --git a/bench_find.rs b/bench_find.rs index 35e6dfe..da82d6e 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -22,9 +22,9 @@ //! use std::env; use std::fs; +use std::time::Duration; -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; - +use criterion::{black_box, Criterion, Throughput}; use memchr::memmem; use stringzilla::sz::{ capabilities as sz_capabilities, // @@ -48,8 +48,8 @@ fn log_stringzilla_metadata() { fn configure_bench() -> Criterion { Criterion::default() .sample_size(1000) // Test this many needles. - .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. - .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. + .warm_up_time(Duration::from_secs(10)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(120)) // Actual measurement time. 
} fn bench_find(c: &mut Criterion) { diff --git a/bench_sort.rs b/bench_sort.rs new file mode 100644 index 0000000..21faed2 --- /dev/null +++ b/bench_sort.rs @@ -0,0 +1,101 @@ +//! # Sorting Benchmarks +//! +//! This file benchmarks the performance of three different sorting routines for +//! arrays of strings: +//! +//! - `sz::sort` from the StringZilla library +//! - The standard library’s `sort_unstable` +//! - Rayon’s parallel sort (`par_sort_unstable`) +//! +//! ## Environment Variables +//! +//! The benchmarks use two environment variables to control the input dataset and mode: +//! +//! - `STRINGWARS_DATASET`: Path to the input dataset file. +//! - `STRINGWARS_MODE`: Specifies how to interpret the input. Allowed values: +//! - `lines`: Process the dataset line by line. +//! - `words`: Process the dataset word by word. +//! - `file`: Process the entire file as a single unit. +//! +//! ## Usage Example +//! +//! ```sh +//! STRINGWARS_MODE=lines STRINGWARS_DATASET=path/to/dataset cargo bench --bench bench_sort +//! ``` + +use std::env; +use std::fs; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rayon::prelude::*; + +// Import the specialized sort from StringZilla. It is assumed that `sz::sort` +// sorts a mutable slice of `String` in place. +use stringzilla::sz::sort as sz_sort; + +fn load_data() -> Vec { + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + + let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); + let data: Vec = match mode.as_str() { + "lines" => content.lines().map(|line| line.to_string()).collect(), + "words" => content + .split_whitespace() + .map(|word| word.to_string()) + .collect(), + "file" => vec![content], + other => panic!( + "Unknown STRINGWARS_MODE: {}. 
Use 'lines', 'words', or 'file'.", + other + ), + }; + data +} + +fn bench_sort(c: &mut Criterion) { + // Load the dataset once; each benchmark iteration will clone this unsorted data. + let unsorted = load_data(); + + if unsorted.is_empty() { + panic!("No data found in dataset for sorting benchmark."); + } + + let mut group = c.benchmark_group("sorting"); + + // Benchmark: Specialized sort from StringZilla. + group.bench_function("sz::sort", |b| { + b.iter(|| { + // Clone to ensure each sort works on an unsorted vector. + let mut data = unsorted.clone(); + // Perform the specialized sort. + sz_sort(black_box(&mut data)); + black_box(&data); + }) + }); + + // Benchmark: Standard library sort_unstable. + group.bench_function("std::sort_unstable", |b| { + b.iter(|| { + let mut data = unsorted.clone(); + data.sort_unstable(); + black_box(&data); + }) + }); + + // Benchmark: Rayon parallel sort_unstable. + group.bench_function("rayon::par_sort_unstable", |b| { + b.iter(|| { + let mut data = unsorted.clone(); + // Parallel sort requires the `rayon` crate and the ParallelSliceMut trait. 
+ data.par_sort_unstable(); + black_box(&data); + }) + }); + + group.finish(); +} + +criterion_group!(benches, bench_sort); +criterion_main!(benches); From 934af12afb5a7f266b1abceaa13346707ef2e5d3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 12:57:57 +0000 Subject: [PATCH 10/29] Improve: Unified design --- .vscode/settings.json | 5 +- Cargo.toml | 3 +- README.md | 10 ++-- bench_tfidf.rs => bench_feature.rs | 0 bench_find.rs | 75 +++++++++++++----------- bench_hash.rs | 57 ++++++++---------- bench_levenshtein.rs | 92 ++++++++++++++++++++++++------ bench_sort.rs | 23 ++++---- 8 files changed, 163 insertions(+), 102 deletions(-) rename bench_tfidf.rs => bench_feature.rs (100%) diff --git a/.vscode/settings.json b/.vscode/settings.json index b57ebfd..38ece84 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,13 +1,16 @@ { "cSpell.words": [ "ahash", + "Bioinformatics", "bytesum", "memchr", "memmem", + "Needleman", "rapidfuzz", "rfind", "stringwars", "stringzilla", - "tfidf" + "tfidf", + "Wunsch" ] } \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ff02ba5..c5f6c5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,13 +7,14 @@ edition = "2018" rand = "0.8.5" criterion = "0.5.1" # stringzilla = { version = "3.3.0" } -stringzilla = { path = "../StringZilla-dev" } +stringzilla = { path = "../StringZilla/" } # Feature-based dependencies for benchmarks [features] bench_find = ["memchr"] bench_levenshtein = ["rapidfuzz"] bench_hash = ["ahash", "xxhash-rust", "blake3", "gxhash"] +# bench_sequence = ["arrow-rs", "rayon"] [dependencies.memchr] version = "2.7.1" diff --git a/README.md b/README.md index c259f79..22c5903 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,8 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable - Edit Distance: ```bash - STRINGWARS_MODE=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion 
--features bench_levenshtein bench_levenshtein --jobs 8 - STRINGWARS_MODE=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + STRINGWARS_TOKENS=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 + STRINGWARS_TOKENS=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 ``` Edit distance benchmarks compute the Levenshtein distance between consecutive pairs of whitespace-delimited words or newline-delimited lines. @@ -78,9 +78,9 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable - Hashing: ```bash - STRINGWARS_MODE=file STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 - STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 - STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + STRINGWARS_TOKENS=file STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + STRINGWARS_TOKENS=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 + STRINGWARS_TOKENS=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 ``` - Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf): diff --git a/bench_tfidf.rs b/bench_feature.rs similarity index 100% rename from bench_tfidf.rs rename to bench_feature.rs diff --git a/bench_find.rs b/bench_find.rs index da82d6e..8439773 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -1,48 +1,48 @@ -//! # StringWa.rs Search Benchmarks +//! # StringWa.rs: Substring Search Benchmarks //! -//! This file benchmarks the forward and reverse search functionality provided by -//! 
the StringZilla library and the memchr crate. The benchmarks read an input file -//! (specified by the `STRINGWARS_DATASET` environment variable), tokenize its contents -//! by whitespace into search needles, and then run forward and reverse search benchmarks. +//! This file benchmarks the forward and reverse exact substring search functionality provided by +//! the StringZilla library and the memchr crate. The input file is treated as a haystack and all +//! of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for +//! any sampled token - all of its inclusions in a string are located. //! -//! ## Usage +//! ## Usage Examples //! -//! Set the environment variable `STRINGWARS_DATASET` to the path of your input file. -//! Then run the benchmarks with: +//! The benchmarks use two environment variables to control the input dataset and mode: //! -//! ```sh -//! STRINGWARS_DATASET= cargo bench --features bench_search -//! ``` +//! - `STRINGWARS_DATASET`: Path to the input dataset file. +//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: +//! - `lines`: Process the dataset line by line. +//! - `words`: Process the dataset word by word. //! -//! ## Library Metadata -//! -//! Before running the benchmarks, this binary logs the StringZilla metadata (version, -//! dynamic dispatch status, and capabilities) so that you can verify that the library -//! is configured correctly for your CPU. +//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: //! +//! ```sh +//! RUSTFLAGS="-C target-cpu=native" \ +//! STRINGWARS_DATASET=README.md \ +//! STRINGWARS_TOKENS=lines \ +//! cargo criterion --features bench_find bench_find --jobs 8 +//! 
``` use std::env; use std::fs; use std::time::Duration; use criterion::{black_box, Criterion, Throughput}; + use memchr::memmem; +use stringzilla::sz::{find as sz_find, rfind as sz_rfind}; + use stringzilla::sz::{ - capabilities as sz_capabilities, // + // Pull some metadata logging functionality + capabilities as sz_capabilities, dynamic_dispatch as sz_dynamic_dispatch, version as sz_version, }; fn log_stringzilla_metadata() { - let sz_v = sz_version(); - println!( - "StringZilla version: {}.{}.{}", - sz_v.major, sz_v.minor, sz_v.patch - ); - println!( - "StringZilla uses dynamic dispatch: {}", - sz_dynamic_dispatch() - ); - println!("StringZilla capabilities: {}", sz_capabilities().as_str()); + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); } fn configure_bench() -> Criterion { @@ -56,10 +56,19 @@ fn bench_find(c: &mut Criterion) { // Get the haystack path from the environment variable. let dataset_path = env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack"); - // Tokenize the haystack content by white space. - let needles: Vec<&str> = haystack_content.split_whitespace().collect(); + // Tokenize the haystack content by white space or lines. + let needles: Vec<&str> = match mode.as_str() { + "lines" => haystack_content.lines().collect(), + "words" => haystack_content.split_whitespace().collect(), + other => panic!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines' or 'words'.", + other + ), + }; + if needles.is_empty() { panic!("No tokens found in the haystack."); } @@ -87,12 +96,12 @@ fn perform_forward_benchmarks( ) { // Benchmark for StringZilla forward search using a cycle iterator. 
let mut tokens = needles.iter().cycle(); - g.bench_function("stringzilla::find", |b| { + g.bench_function("sz::find", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; - while let Some(found) = sz::find(&haystack[pos..], token_bytes) { + while let Some(found) = sz_find(&haystack[pos..], token_bytes) { pos += found + token_bytes.len(); } }) @@ -119,13 +128,13 @@ fn perform_reverse_benchmarks( ) { // Benchmark for StringZilla reverse search using a cycle iterator. let mut tokens = needles.iter().cycle(); - g.bench_function("stringzilla::rfind", |b| { + g.bench_function("sz::rfind", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); let token_bytes = black_box(token.as_bytes()); let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { - if let Some(found) = sz::rfind(&haystack[..end], token_bytes) { + if let Some(found) = sz_rfind(&haystack[..end], token_bytes) { pos = Some(found); } else { break; diff --git a/bench_hash.rs b/bench_hash.rs index 2860903..18f71dd 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -1,41 +1,38 @@ -//! # StringWa.rs Hashing Benchmarks +//! # StringWa.rs: String Hashing Benchmarks //! //! This file contains benchmarks for various Rust hashing libraries using Criterion. //! //! The benchmarks compare the performance of different hash functions including: //! -//! - StringZilla (`bytesum`, `hash`, and incremental `hash` variants) +//! - Standard `Hash` implementation +//! - StringZilla (`bytesum`, `hash`, and incremental `hash` function variants) //! - aHash (both incremental and single-entry variants) -//! - gxhash (gxhash64) -//! - Blake3 (default cryptographic hash) //! - xxHash (xxh3) through the third-party `xxhash-rust` crate +//! - gxhash (gxhash64) +//! - Blake3 (the only cryptographic hash in the comparison, for reference) //! -//! ## Environment Variables +//! ## Usage Examples //! //! 
The benchmarks use two environment variables to control the input dataset and mode: //! //! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_MODE`: Specifies how to interpret the input. Allowed values: +//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: //! - `lines`: Process the dataset line by line. //! - `words`: Process the dataset word by word. //! - `file`: Process the entire file as a single unit. //! -//! You should also set the `RUSTFLAGS` environment variable to enable the appropriate CPU features. -//! -//! ## Usage Examples -//! //! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: //! //! ```sh -//! STRINGWARS_MODE=file STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 -//! STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 -//! STRINGWARS_MODE=words STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_hash bench_hash --jobs 8 +//! RUSTFLAGS="-C target-cpu=native" \ +//! STRINGWARS_DATASET=README.md \ +//! STRINGWARS_TOKENS=lines \ +//! cargo criterion --features bench_hash bench_hash --jobs 8 //! ``` //! //! ## Notes //! //! - Ensure your CPU supports the required AES and SSE2 instructions when using `gxhash`. -//! - The benchmarks aggregate hashing over the dataset for more realistic throughput measurements. 
use std::env; use std::fs; @@ -45,14 +42,22 @@ use ahash::RandomState; use blake3; use gxhash; use std::hash::{BuildHasher, Hasher}; +use stringzilla::sz::{bytesum as sz_bytesum, hash as sz_hash}; +use xxhash_rust::xxh3::xxh3_64; + use stringzilla::sz::{ - bytesum as sz_bytesum, // + // Pull some metadata logging functionality capabilities as sz_capabilities, dynamic_dispatch as sz_dynamic_dispatch, - hash as sz_hash, version as sz_version, }; -use xxhash_rust::xxh3::xxh3_64; + +fn log_stringzilla_metadata() { + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); +} fn configure_bench() -> Criterion { Criterion::default() @@ -63,7 +68,7 @@ fn configure_bench() -> Criterion { fn bench_hash(c: &mut Criterion) { let dataset_path = env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); - let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); let units: Vec<&str> = match mode.as_str() { @@ -74,7 +79,7 @@ fn bench_hash(c: &mut Criterion) { vec![&content] } other => panic!( - "Unknown STRINGWARS_MODE: {}. Use 'lines', 'words', or 'file'.", + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", other ), }; @@ -165,19 +170,7 @@ fn perform_hashing_benchmarks( } fn main() { - // Log the library version info before running benchmarks. - let sz_v = sz_version(); - println!( - "StringZilla version: {}.{}.{}", - sz_v.major, sz_v.minor, sz_v.patch - ); - println!( - "StringZilla uses dynamic dispatch: {}", - sz_dynamic_dispatch() - ); - println!("StringZilla capabilities: {}", sz_capabilities().as_str()); - - // Create a Criterion instance using any desired configuration. 
+ log_stringzilla_metadata(); let mut criterion = Criterion::default().configure_from_args(); bench_hash(&mut criterion); criterion.final_summary(); diff --git a/bench_levenshtein.rs b/bench_levenshtein.rs index f997858..8a6c691 100644 --- a/bench_levenshtein.rs +++ b/bench_levenshtein.rs @@ -1,9 +1,60 @@ +//! # StringWa.rs: String Similarity Benchmarks +//! +//! This file benchmarks different libraries implementing string alignment and edit +//! distance calculation, for both generic Levenshtein distances and the weighted +//! Needleman-Wunsch alignment scores used in Bioinformatics. +//! +//! The input file is tokenized into lines or words and each consecutive pair of tokens +//! is evaluated for similarity. As most algorithms have quadratic complexity and use +//! Dynamic Programming techniques, their throughput is evaluate in the number of CUPS, +//! or Cell Updates Per Second. +//! +//! ## Usage Examples +//! +//! The benchmarks use two environment variables to control the input dataset and mode: +//! +//! - `STRINGWARS_DATASET`: Path to the input dataset file. +//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: +//! - `lines`: Process the dataset line by line. +//! - `words`: Process the dataset word by word. +//! - `STRINGWARS_ERROR_BOUND`: Maximum error bound, defined as an integer percent. +//! +//! ```sh +//! RUSTFLAGS="-C target-cpu=native" \ +//! STRINGWARS_DATASET=README.md \ +//! STRINGWARS_ERROR_BOUND=15 \ +//! STRINGWARS_TOKENS=lines \ +//! cargo criterion --features bench_similarity bench_similarity --jobs 8 +//! ``` +//! 
use std::env; use std::fs; use criterion::{criterion_group, criterion_main, Criterion}; + use rapidfuzz::distance::levenshtein; -use stringzilla::StringZilla; +use stringzilla::sz::{ + alignment_score as sz_alignment_score, // + levenshtein_distance as sz_levenshtein_distance, + levenshtein_distance_bounded as sz_levenshtein_distance_bounded, + levenshtein_distance_utf8 as sz_levenshtein_distance_utf8, + levenshtein_distance_utf8_bounded as sz_levenshtein_distance_utf8_bounded, + unary_substitution_costs as sz_unary_substitution_costs, +}; + +use stringzilla::sz::{ + // Pull some metadata logging functionality + capabilities as sz_capabilities, + dynamic_dispatch as sz_dynamic_dispatch, + version as sz_version, +}; + +fn log_stringzilla_metadata() { + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); +} fn configure_bench() -> Criterion { Criterion::default() @@ -15,7 +66,7 @@ fn configure_bench() -> Criterion { fn bench_levenshtein(c: &mut Criterion) { let dataset_path = env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); - let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); let bound_percent = env::var("STRINGWARS_ERROR_BOUND") @@ -32,7 +83,7 @@ fn bench_levenshtein(c: &mut Criterion) { "words" => content.split_whitespace().collect(), "lines" => content.lines().collect(), other => panic!( - "Unknown STRINGWARS_MODE: {}. Use 'lines' or 'words'.", + "Unknown STRINGWARS_TOKENS: {}. 
Use 'lines' or 'words'.", other ), }; @@ -60,6 +111,15 @@ fn bench_levenshtein(c: &mut Criterion) { pairs.truncate(max_pairs); } + // In "unbounded" benchmarks we report the total number of Dynamic + // Programming (DP) matrix evaluated by the algorithm, aka "CUPS". + let mut g = c.benchmark_group("unbounded"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_unbounded_benchmarks(&mut g, &pairs, &pair_bounds); + g.finish(); + + // In case of "bounded" benchmarks, only one band of the DP matrix + // needs to be evaluated, so the throughput is computed differently. let pair_bounds: Vec = pairs .iter() .map(|(a, b)| { @@ -67,15 +127,9 @@ fn bench_levenshtein(c: &mut Criterion) { ((max_len as u64 * bound_percent) / 100) as usize }) .collect(); - - let mut g = c.benchmark_group("levenshtein"); - - perform_levenshtein_benchmarks(&mut g, &pairs, &pair_bounds); - - g.finish(); } -fn perform_levenshtein_benchmarks( +fn perform_unbounded_benchmarks( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, pairs: &[(&str, &str)], pair_bounds: &[usize], @@ -83,7 +137,7 @@ fn perform_levenshtein_benchmarks( // StringZilla, bytes-based, unbounded { let mut pair_index = 0; - g.bench_function("stringzilla::levenshtein_bytes_unbounded", |b| { + g.bench_function("sz::levenshtein_bytes_unbounded", |b| { b.iter(|| { let (a, b_str) = pairs[pair_index]; let _distance = a.sz_edit_distance(b_str.as_bytes()); @@ -95,7 +149,7 @@ fn perform_levenshtein_benchmarks( // StringZilla, bytes-based, bounded { let mut pair_index = 0; - g.bench_function("stringzilla::levenshtein_bytes_bounded", |b| { + g.bench_function("sz::levenshtein_bytes_bounded", |b| { b.iter(|| { let (a, b_str) = pairs[pair_index]; let bound = pair_bounds[pair_index]; @@ -110,7 +164,7 @@ fn perform_levenshtein_benchmarks( // StringZilla, UTF-8, unbounded { let mut pair_index = 0; - g.bench_function("stringzilla::levenshtein_utf8_unbounded", |b| { + 
g.bench_function("sz::levenshtein_utf8_unbounded", |b| { b.iter(|| { let (a, b_str) = pairs[pair_index]; let _distance = a.as_bytes().sz_edit_distance_utf8(b_str.as_bytes()); @@ -122,7 +176,7 @@ fn perform_levenshtein_benchmarks( // StringZilla, UTF-8, bounded { let mut pair_index = 0; - g.bench_function("stringzilla::levenshtein_utf8_bounded", |b| { + g.bench_function("sz::levenshtein_utf8_bounded", |b| { b.iter(|| { let (a, b_str) = pairs[pair_index]; let bound = pair_bounds[pair_index]; @@ -193,9 +247,9 @@ fn perform_levenshtein_benchmarks( } } -criterion_group! { - name = bench_levenshtein_group; - config = configure_bench(); - targets = bench_levenshtein +fn main() { + log_stringzilla_metadata(); + let mut criterion = configure_bench(); + bench_levenshtein(&mut criterion); + criterion.final_summary(); } -criterion_main!(bench_levenshtein_group); diff --git a/bench_sort.rs b/bench_sort.rs index 21faed2..79f45dd 100644 --- a/bench_sort.rs +++ b/bench_sort.rs @@ -1,26 +1,28 @@ -//! # Sorting Benchmarks +//! # StringWa.rs: String Sorting Benchmarks //! -//! This file benchmarks the performance of three different sorting routines for -//! arrays of strings: +//! This file benchmarks various libraries for processing string-identifiable collections. +//! Including sorting arrays of strings: //! //! - `sz::sort` from the StringZilla library //! - The standard library’s `sort_unstable` //! - Rayon’s parallel sort (`par_sort_unstable`) //! -//! ## Environment Variables +//! Intersecting string collections, similar to "STRICT INNER JOIN" in SQL databases. +//! +//! ## Usage Example //! //! The benchmarks use two environment variables to control the input dataset and mode: //! //! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_MODE`: Specifies how to interpret the input. Allowed values: +//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: //! - `lines`: Process the dataset line by line. //! 
- `words`: Process the dataset word by word. -//! - `file`: Process the entire file as a single unit. //! -//! ## Usage Example +//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: //! //! ```sh -//! STRINGWARS_MODE=lines STRINGWARS_DATASET=path/to/dataset cargo bench --bench bench_sort +//! STRINGWARS_TOKENS=lines STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_sort bench_sort --jobs 8 +//! STRINGWARS_TOKENS=words STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_sort bench_sort --jobs 8 //! ``` use std::env; @@ -36,7 +38,7 @@ use stringzilla::sz::sort as sz_sort; fn load_data() -> Vec { let dataset_path = env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); - let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string()); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); let data: Vec = match mode.as_str() { @@ -45,9 +47,8 @@ fn load_data() -> Vec { .split_whitespace() .map(|word| word.to_string()) .collect(), - "file" => vec![content], other => panic!( - "Unknown STRINGWARS_MODE: {}. Use 'lines', 'words', or 'file'.", + "Unknown STRINGWARS_TOKENS: {}. 
Use 'lines' or 'words'.", other ), }; From c0ef9ea2d442a44f161b77257790fd80d5b9d807 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:47:04 +0000 Subject: [PATCH 11/29] Make: Drop legacy helpers --- src/lib.rs | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/lib.rs diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 83bd5a8..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,12 +0,0 @@ -use rand::{rngs::ThreadRng, Rng}; - -pub fn open() -> Vec { - let path = std::env::var("FILE").unwrap_or_default(); - let file = std::fs::read(path).unwrap(); - file -} - -pub fn random_token<'a, 'b>(rng: &'b mut ThreadRng, file: &'a Vec) -> &'a [u8] { - let tokens: Vec<&[u8]> = file.split(|c| *c == b' ').collect(); - tokens[rng.gen_range(0..tokens.len())] -} From 2bf46adee0d83b4b89faf6fe838d1615bdbf49f0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:47:20 +0000 Subject: [PATCH 12/29] Improve: Put slowest hash in the end --- bench_hash.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bench_hash.rs b/bench_hash.rs index 18f71dd..954c87b 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -150,20 +150,20 @@ fn perform_hashing_benchmarks( }) }); - // Benchmark: Blake3 - group.bench_function("blake3", |b| { + // Benchmark: gxhash + group.bench_function("gxhash", |b| { b.iter(|| { for unit in units { - let _hash = black_box(blake3::hash(unit.as_bytes())); + let _hash = black_box(gxhash::gxhash64(unit.as_bytes(), 42)); } }) }); - // Benchmark: gxhash - group.bench_function("gxhash", |b| { + // Benchmark: Blake3 - should be by far the slowest, as it's a cryptographic hash. 
+ group.bench_function("blake3", |b| { b.iter(|| { for unit in units { - let _hash = black_box(gxhash::gxhash64(unit.as_bytes(), 42)); + let _hash = black_box(blake3::hash(unit.as_bytes())); } }) }); From 3dc097d467ede3a7a2fa0ba741bbcd0b6e3def2e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:47:44 +0000 Subject: [PATCH 13/29] Add: Native substring search benchmarks --- bench_find.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/bench_find.rs b/bench_find.rs index 8439773..27d93b0 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -107,7 +107,7 @@ fn perform_forward_benchmarks( }) }); - // Benchmark for memmem forward search using a cycle iterator. + // Benchmark for `memmem` forward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("memmem::find", |b| { b.iter(|| { @@ -119,6 +119,18 @@ fn perform_forward_benchmarks( } }) }); + + // Benchmark for default `std::str` forward search. + let mut tokens = needles.iter().cycle(); + g.bench_function("std::str::find", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos = 0; + while let Some(found) = haystack_str[pos..].find(token) { + pos += found + token.len(); + } + }) + }); } fn perform_reverse_benchmarks( @@ -159,6 +171,22 @@ fn perform_reverse_benchmarks( } }) }); + + // Benchmark for default `std::str` reverse search. 
+ let mut tokens = needles.iter().cycle(); + g.bench_function("std::str::rfind", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: Option = Some(haystack_str.len()); + while let Some(end) = pos { + if let Some(found) = haystack_str[..end].rfind(token) { + pos = Some(found); + } else { + break; + } + } + }) + }); } fn main() { From ebe8af93e08d294b92d159d25d326a1e5ad82b37 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:47:54 +0000 Subject: [PATCH 14/29] Docs: Hashing throughput --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 22c5903..9ab03aa 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,31 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv Of course, the functionality of the projects is different, as are the APIs and the usage patterns. So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations. +## String Hashing Benchmarks + +Many great hashing libraries exist in Rust, C, and C++. +Typical top choices are `aHash`, `xxHash`, `blake3`, `gxhash`, `CityHash`, `MurmurHash`, or the native `std::hash`. +Many of them have similar pitfalls: + +- They are not always documented to have a certain reproducible output and are recommended for use only for local in-memory construction of hash tables, not for serialization or network communication. +- They don't always support streaming and require the whole input to be available in memory at once. +- They rarely benefit from predicated SIMD instructions on modern hardware like AVX-512 on x86 or SVE on Arm. +- They don't always pass the SMHasher test suite, especially with `--extra` checks enabled. + +StringZilla addresses those issues and seems to provide competitive performance. 
+On Intel Sapphire Rapids CPU, on `xlsum.csv` dataset, the following numbers can be expected for hashing individual whitespace-delimited words and newline-delimited lines: + +| Benchmark | Lines | Words | +| ---------------------- | ----------: | ---------: | +| `std::hash` (SipHash) | 3.74 GiB/s | 0.43 GiB/s | +| `stringzilla::bytesum` | 11.65 GiB/s | 2.16 GiB/s | +| `stringzilla::hash` | 11.23 GiB/s | 1.84 GiB/s | +| `aHash::hash_one` | 8.61 GiB/s | 1.23 GiB/s | +| `xxh3` | 9.48 GiB/s | 1.08 GiB/s | +| `blake3` | 1.97 GiB/s | N/A GiB/s | +| `gxhash` | 10.81 GiB/s | N/A GiB/s | + + ## Substring Search Benchmarks Substring search is one of the most common operations in text processing, and one of the slowest. From 5d5469e6e94fe9a8e069c57dc4fe6f96b10c677e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:18:28 +0000 Subject: [PATCH 15/29] Improve: Rename files --- bench_sort.rs => bench_sequence.rs | 0 bench_levenshtein.rs => bench_similarity.rs | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename bench_sort.rs => bench_sequence.rs (100%) rename bench_levenshtein.rs => bench_similarity.rs (100%) diff --git a/bench_sort.rs b/bench_sequence.rs similarity index 100% rename from bench_sort.rs rename to bench_sequence.rs diff --git a/bench_levenshtein.rs b/bench_similarity.rs similarity index 100% rename from bench_levenshtein.rs rename to bench_similarity.rs From 65cc43ded011b1afa00554cb8c191b9228458561 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:18:45 +0000 Subject: [PATCH 16/29] Add: Byteset search benchmarks --- bench_find.rs | 165 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 15 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 27d93b0..4c42392 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -1,10 +1,18 @@ //! # StringWa.rs: Substring Search Benchmarks //! -//! 
This file benchmarks the forward and reverse exact substring search functionality provided by +//! This file benchmarks the forward and backward exact substring search functionality provided by //! the StringZilla library and the memchr crate. The input file is treated as a haystack and all //! of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for //! any sampled token - all of its inclusions in a string are located. //! +//! The input file is treated as a haystack and all of its tokens as needles. For substring searches, +//! each occurrence is located. For byteset searches, three separate operations are performed per token, +//! looking for: +//! +//! - any of "\n\r\v\f" - the 4 tabulation characters +//! - any of "&'\"=[]" - the 9 HTML-related characters +//! - any of "0123456789" - the 10 numeric characters +//! //! ## Usage Examples //! //! The benchmarks use two environment variables to control the input dataset and mode: @@ -28,8 +36,16 @@ use std::time::Duration; use criterion::{black_box, Criterion, Throughput}; +use aho_corasick::AhoCorasick; +use bstr::ByteSlice; use memchr::memmem; -use stringzilla::sz::{find as sz_find, rfind as sz_rfind}; +use regex::bytes::Regex; +use stringzilla::sz::{ + find as sz_find, + find_byteset as sz_find_byteset, // + rfind as sz_rfind, + Byteset, +}; use stringzilla::sz::{ // Pull some metadata logging functionality @@ -77,19 +93,25 @@ fn bench_find(c: &mut Criterion) { let haystack_length = haystack.len(); // Benchmarks for forward search - let mut g = c.benchmark_group("search-forward"); + let mut g = c.benchmark_group("substring-forward"); g.throughput(Throughput::Bytes(haystack_length as u64)); - perform_forward_benchmarks(&mut g, &needles, haystack); + bench_substring_forward(&mut g, &needles, haystack); g.finish(); - // Benchmarks for reverse search - let mut g = c.benchmark_group("search-reverse"); + // Benchmarks for backward search + let mut g = 
c.benchmark_group("substring-backward"); g.throughput(Throughput::Bytes(haystack_length as u64)); - perform_reverse_benchmarks(&mut g, &needles, haystack); + bench_substring_backward(&mut g, &needles, haystack); + g.finish(); + + // Benchmarks for byteset search + let mut g = c.benchmark_group("byteset-forward"); + g.throughput(Throughput::Bytes(3 * haystack_length as u64)); + bench_byteset_forward(&mut g, &needles); g.finish(); } -fn perform_forward_benchmarks( +fn bench_substring_forward( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, needles: &[&str], haystack: &[u8], @@ -126,19 +148,19 @@ fn perform_forward_benchmarks( b.iter(|| { let token = black_box(*tokens.next().unwrap()); let mut pos = 0; - while let Some(found) = haystack_str[pos..].find(token) { + while let Some(found) = haystack[pos..].find(token) { pos += found + token.len(); } }) }); } -fn perform_reverse_benchmarks( +fn bench_substring_backward( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, needles: &[&str], haystack: &[u8], ) { - // Benchmark for StringZilla reverse search using a cycle iterator. + // Benchmark for StringZilla backward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("sz::rfind", |b| { b.iter(|| { @@ -155,7 +177,7 @@ fn perform_reverse_benchmarks( }) }); - // Benchmark for memmem reverse search using a cycle iterator. + // Benchmark for memmem backward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("memmem::rfind", |b| { b.iter(|| { @@ -172,14 +194,14 @@ fn perform_reverse_benchmarks( }) }); - // Benchmark for default `std::str` reverse search. + // Benchmark for default `std::str` backward search. 
let mut tokens = needles.iter().cycle(); g.bench_function("std::str::rfind", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); - let mut pos: Option = Some(haystack_str.len()); + let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { - if let Some(found) = haystack_str[..end].rfind(token) { + if let Some(found) = haystack[..end].rfind(token) { pos = Some(found); } else { break; @@ -189,6 +211,119 @@ fn perform_reverse_benchmarks( }); } +fn bench_byteset_forward( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + needles: &[&str], +) { + // Define the three bytesets we will analyze. + const BYTES_TABS: &[u8] = b"\n\r\x0B\x0C"; + const BYTES_HTML: &[u8] = b"&'\"=[]"; + const BYTES_DIGITS: &[u8] = b"0123456789"; + + // Benchmark for StringZilla forward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + let sz_tabs = Byteset::from(BYTES_TABS); + let sz_html = Byteset::from(BYTES_HTML); + let sz_digits = Byteset::from(BYTES_DIGITS); + g.bench_function("sz::find_byteset", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); + let mut pos: usize = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_tabs) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_html) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_digits) { + pos += found + 1; + } + }) + }); + + // Benchmark for bstr's byteset search. + let mut tokens = needles.iter().cycle(); + g.bench_function("bstr::iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let token_bytes = black_box(token.as_bytes()); + let mut pos: usize = 0; + // Inline search for `BYTES_TABS`. + while let Some(found) = token_bytes[pos..] 
+ .iter() + .position(|&c| BYTES_TABS.contains(&c)) + { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_HTML`. + while let Some(found) = token_bytes[pos..] + .iter() + .position(|&c| BYTES_HTML.contains(&c)) + { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_DIGITS`. + while let Some(found) = token_bytes[pos..] + .iter() + .position(|&c| BYTES_DIGITS.contains(&c)) + { + pos += found + 1; + } + }) + }); + + // Benchmark for Regex-based byteset search. + let mut tokens = needles.iter().cycle(); + let re_tabs = Regex::new("[\n\r\x0B\x0C]").unwrap(); + let re_html = Regex::new("[&'\"=\\[\\]]").unwrap(); + let re_digits = Regex::new("[0-9]").unwrap(); + g.bench_function("regex::find_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + black_box(re_tabs.find_iter(token.as_bytes()).count()); + black_box(re_html.find_iter(token.as_bytes()).count()); + black_box(re_digits.find_iter(token.as_bytes()).count()); + }) + }); + + // Benchmark for Aho–Corasick-based byteset search. 
+ let mut tokens: std::iter::Cycle> = needles.iter().cycle(); + let ac_tabs = AhoCorasick::new( + &BYTES_TABS + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + let ac_html = AhoCorasick::new( + &BYTES_HTML + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + let ac_digits = AhoCorasick::new( + &BYTES_DIGITS + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + g.bench_function("aho_corasick::find_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + black_box(ac_tabs.find_iter(token).count()); + black_box(ac_html.find_iter(token).count()); + black_box(ac_digits.find_iter(token).count()); + }) + }); +} + fn main() { log_stringzilla_metadata(); let mut criterion = configure_bench(); From a4d824712849b62956a9d7ea85cbc3384aa690da Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 19:56:50 +0000 Subject: [PATCH 17/29] Make: New dependencies --- Cargo.lock | 1169 ++++++++++++++++++++++++++++++++++++++++++++++++---- Cargo.toml | 38 +- 2 files changed, 1118 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de66ffb..e23cff7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,6 +9,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "const-random", "getrandom", "once_cell", "version_check", @@ -17,13 +18,28 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] 
+[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anes" version = "0.1.6" @@ -36,6 +52,21 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -48,12 +79,306 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" +dependencies = [ + "arrow-array", 
+ "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = 
"arrow-json" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" + +[[package]] +name = "arrow-select" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bio" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0875bce309de30d684a736aaf628bf9edbd3bdad896eb11b72a182155a5fb97f" +dependencies = [ + "anyhow", + "approx", + "bio-types", + "bit-set", + "bv", + "bytecount", + "csv", + "custom_derive", + "editdistancek", + "enum-map", + "fxhash", + "itertools", + "itertools-num", + "lazy_static", + "multimap", + "ndarray", + "newtype_derive", + "num-integer", + "num-traits", + "ordered-float", + "petgraph", + "rand", + "regex", + "serde", + "serde_derive", + "statrs", + "strum", + "strum_macros", + "thiserror 2.0.12", + "triple_accel", + "vec_map", +] + +[[package]] +name = "bio-types" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4dcf54f8b7f51450207d54780bab09c05f30b8b0caa991545082842e466ad7e" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros", + "thiserror 1.0.69", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "blake3" version = "1.6.1" @@ -67,12 +392,55 @@ dependencies = [ "constant_time_eq", ] +[[package]] +name = "bstr" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" +dependencies = [ + "memchr", +] + [[package]] name = "bumpalo" version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +[[package]] +name = "bv" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" +dependencies = [ + "feature-probe", + "serde", +] + +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + +[[package]] +name = "bytemuck" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cast" version = "0.3.0" @@ -94,6 +462,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.39" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets", +] + [[package]] name = "ciborium" version = "0.2.2" @@ -146,12 +526,38 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + [[package]] name = "constant_time_eq" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "criterion" version = "0.5.1" @@ -179,148 +585,505 @@ dependencies = [ ] [[package]] -name = "criterion-plot" -version = "0.5.0" +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" 
+dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +dependencies = [ + "memchr", +] + +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + +[[package]] +name = "derive-new" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "editdistancek" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e02df23d5b1c6f9e69fa603b890378123b93073df998a21e6e33b9db0a32613" + +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "enum-map" +version = "2.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = "enum-map-derive" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "feature-probe" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags", + "rustc_version 0.4.1", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gxhash" +version = "3.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" +dependencies = [ + "rustversion", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" + +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3954d50fe15b02142bf25d3b8bdadb634ec3948f103d04ffe3031bc8fe9d7058" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools-num" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "js-sys" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ - "cast", - "itertools", + "lexical-parse-integer", + "lexical-util", + "static_assertions", ] 
[[package]] -name = "crossbeam-deque" -version = "0.8.5" +name = "lexical-parse-integer" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", + "lexical-util", + "static_assertions", ] [[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "lexical-util" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ - "crossbeam-utils", + "static_assertions", ] [[package]] -name = "crossbeam-utils" -version = "0.8.19" +name = "lexical-write-float" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] [[package]] -name = "crunchy" -version = "0.2.2" +name = "lexical-write-integer" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +dependencies = [ + "lexical-util", + "static_assertions", +] [[package]] -name = "either" -version = "1.10.0" +name = "libc" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] -name = "getrandom" -version = "0.2.12" +name = "libm" 
+version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "matrixmultiply" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ - "cfg-if", - "libc", - "wasi", + "autocfg", + "rawpointer", ] [[package]] -name = "gxhash" -version = "3.4.1" +name = "memchr" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "multimap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" dependencies = [ - "rustversion", + "serde", ] [[package]] -name = "half" -version = "2.3.1" +name = "nalgebra" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b" dependencies = [ - "cfg-if", - "crunchy", + "approx", + "matrixmultiply", + "num-complex", + "num-rational", + "num-traits", + "rand", + "rand_distr", + "simba", + "typenum", ] [[package]] -name = "hermit-abi" -version = "0.3.6" +name = "ndarray" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" 
+checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] [[package]] -name = "is-terminal" -version = "0.4.12" +name = "newtype_derive" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" dependencies = [ - "hermit-abi", - "libc", - "windows-sys", + "rustc_version 0.1.7", ] [[package]] -name = "itertools" -version = "0.10.5" +name = "num" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" dependencies = [ - "either", + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", ] [[package]] -name = "itoa" -version = "1.0.10" +name = "num-bigint" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] [[package]] -name = "js-sys" -version = "0.3.68" +name = "num-complex" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ - "wasm-bindgen", + "num-traits", ] [[package]] -name = "libc" -version = "0.2.153" +name = "num-integer" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] [[package]] -name = "log" -version = "0.4.20" +name = "num-iter" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] [[package]] -name = "memchr" -version = "2.7.1" +name = "num-rational" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -335,6 +1098,31 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "ordered-float" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01" +dependencies = [ + "num-traits", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "petgraph" +version = "0.7.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "plotters" version = "0.3.5" @@ -363,6 +1151,21 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -371,9 +1174,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -417,17 +1220,33 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + [[package]] name = "rapidfuzz" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "270e04e5ea61d40841942bb15e451c29ee1618637bcf97fc7ede5dd4a9b1601b" +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" -version = "1.8.1" +version = "1.10.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -445,9 +1264,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -457,9 +1276,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -468,9 +1287,27 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +dependencies = [ + "semver 0.1.20", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver 1.0.26", +] [[package]] name = "rustversion" @@ -484,6 +1321,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "safe_arch" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" +dependencies = [ + "bytemuck", +] + [[package]] name = "same-file" version = "1.0.6" @@ -493,6 +1339,18 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.197" @@ -530,17 +1388,54 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simba" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "statrs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" +dependencies = [ + "approx", + "nalgebra", + "num-traits", + "rand", +] + [[package]] name = "stringwars" version = "0.1.0" dependencies = [ "ahash", + "aho-corasick", + "arrow", + "bio", "blake3", + "bstr", "criterion", "gxhash", "memchr", "rand", "rapidfuzz", + "rayon", + "regex", 
"stringzilla", "xxhash-rust", ] @@ -552,17 +1447,85 @@ dependencies = [ "cc", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" -version = "2.0.50" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl 2.0.12", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -573,12 +1536,33 @@ dependencies = [ "serde_json", ] +[[package]] +name = "triple_accel" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + [[package]] name = "version_check" version = "0.9.5" @@ -665,6 +1649,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "wide" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41b5576b9a81633f3e8df296ce0063042a73507636cbe956c61133dd7034ab22" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "winapi" version = "0.3.9" @@ -696,6 +1690,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git 
a/Cargo.toml b/Cargo.toml index c5f6c5c..8e1aa5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,16 +11,34 @@ stringzilla = { path = "../StringZilla/" } # Feature-based dependencies for benchmarks [features] -bench_find = ["memchr"] -bench_levenshtein = ["rapidfuzz"] +bench_find = ["memchr", "bstr", "aho-corasick", "regex"] bench_hash = ["ahash", "xxhash-rust", "blake3", "gxhash"] -# bench_sequence = ["arrow-rs", "rayon"] +bench_sequence = ["arrow", "rayon"] +bench_similarity = ["rapidfuzz", "bio"] [dependencies.memchr] version = "2.7.1" default-features = false optional = true +[dependencies.bio] +version = "2.2.0" +default-features = false +optional = true + +[dependencies.bstr] +version = "1.11.3" +default-features = false +optional = true + +[dependencies.aho-corasick] +version = "1.1.3" +optional = true + +[dependencies.regex] +version = "1.11.1" +optional = true + [dependencies.rapidfuzz] version = "0.5.0" optional = true @@ -42,6 +60,14 @@ version = "0.8" optional = true features = ["xxh3", "const_xxh3"] +[dependencies.arrow] +version = "54.2.1" +optional = true + +[dependencies.rayon] +version = "1.10.0" +optional = true + [[bench]] name = "bench_find" path = "bench_find.rs" @@ -49,10 +75,10 @@ harness = false required-features = ["bench_find"] [[bench]] -name = "bench_levenshtein" -path = "bench_levenshtein.rs" +name = "bench_similarity" +path = "bench_similarity.rs" harness = false -required-features = ["bench_levenshtein"] +required-features = ["bench_similarity"] [[bench]] name = "bench_hash" From bc15e9073cc66b6addc1c2756cc58e2807e7a6ac Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 19:58:17 +0000 Subject: [PATCH 18/29] Add: New byteset benchmarks --- bench_find.rs | 102 +++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 4c42392..7bfd7cd 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ 
-63,7 +63,7 @@ fn log_stringzilla_metadata() { fn configure_bench() -> Criterion { Criterion::default() - .sample_size(1000) // Test this many needles. + .sample_size(10) // Each loop scans the whole dataset. .warm_up_time(Duration::from_secs(10)) // Let the CPU frequencies settle. .measurement_time(Duration::from_secs(120)) // Actual measurement time. } @@ -221,78 +221,77 @@ fn bench_byteset_forward( const BYTES_DIGITS: &[u8] = b"0123456789"; // Benchmark for StringZilla forward search using a cycle iterator. - let mut tokens = needles.iter().cycle(); let sz_tabs = Byteset::from(BYTES_TABS); let sz_html = Byteset::from(BYTES_HTML); let sz_digits = Byteset::from(BYTES_DIGITS); g.bench_function("sz::find_byteset", |b| { b.iter(|| { - let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); - let mut pos: usize = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_tabs) { - pos += found + 1; - } - pos = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_html) { - pos += found + 1; - } - pos = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_digits) { - pos += found + 1; + for token in needles.iter() { + let token_bytes = black_box(token.as_bytes()); + let mut pos: usize = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_tabs) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_html) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_digits) { + pos += found + 1; + } } }) }); // Benchmark for bstr's byteset search. - let mut tokens = needles.iter().cycle(); g.bench_function("bstr::iter", |b| { b.iter(|| { - let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); - let mut pos: usize = 0; - // Inline search for `BYTES_TABS`. - while let Some(found) = token_bytes[pos..] 
- .iter() - .position(|&c| BYTES_TABS.contains(&c)) - { - pos += found + 1; - } - pos = 0; - // Inline search for `BYTES_HTML`. - while let Some(found) = token_bytes[pos..] - .iter() - .position(|&c| BYTES_HTML.contains(&c)) - { - pos += found + 1; - } - pos = 0; - // Inline search for `BYTES_DIGITS`. - while let Some(found) = token_bytes[pos..] - .iter() - .position(|&c| BYTES_DIGITS.contains(&c)) - { - pos += found + 1; + for token in needles.iter() { + let token_bytes = black_box(token.as_bytes()); + let mut pos: usize = 0; + // Inline search for `BYTES_TABS`. + while let Some(found) = token_bytes[pos..] + .iter() + .position(|&c| BYTES_TABS.contains(&c)) + { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_HTML`. + while let Some(found) = token_bytes[pos..] + .iter() + .position(|&c| BYTES_HTML.contains(&c)) + { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_DIGITS`. + while let Some(found) = token_bytes[pos..] + .iter() + .position(|&c| BYTES_DIGITS.contains(&c)) + { + pos += found + 1; + } } }) }); // Benchmark for Regex-based byteset search. - let mut tokens = needles.iter().cycle(); let re_tabs = Regex::new("[\n\r\x0B\x0C]").unwrap(); let re_html = Regex::new("[&'\"=\\[\\]]").unwrap(); let re_digits = Regex::new("[0-9]").unwrap(); g.bench_function("regex::find_iter", |b| { b.iter(|| { - let token = black_box(*tokens.next().unwrap()); - black_box(re_tabs.find_iter(token.as_bytes()).count()); - black_box(re_html.find_iter(token.as_bytes()).count()); - black_box(re_digits.find_iter(token.as_bytes()).count()); + for token in needles.iter() { + black_box(re_tabs.find_iter(token.as_bytes()).count()); + black_box(re_html.find_iter(token.as_bytes()).count()); + black_box(re_digits.find_iter(token.as_bytes()).count()); + } }) }); // Benchmark for Aho–Corasick-based byteset search. 
- let mut tokens: std::iter::Cycle> = needles.iter().cycle(); let ac_tabs = AhoCorasick::new( &BYTES_TABS .iter() @@ -316,10 +315,11 @@ fn bench_byteset_forward( .expect("failed to create AhoCorasick FSA"); g.bench_function("aho_corasick::find_iter", |b| { b.iter(|| { - let token = black_box(*tokens.next().unwrap()); - black_box(ac_tabs.find_iter(token).count()); - black_box(ac_html.find_iter(token).count()); - black_box(ac_digits.find_iter(token).count()); + for token in needles.iter() { + black_box(ac_tabs.find_iter(token).count()); + black_box(ac_html.find_iter(token).count()); + black_box(ac_digits.find_iter(token).count()); + } }) }); } From 567d39cdd8809386eba20283b4c76bfd95ba1fc9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 23:36:11 +0000 Subject: [PATCH 19/29] Improve: Naming hash benchmarks --- bench_hash.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bench_hash.rs b/bench_hash.rs index 954c87b..03093ad 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -90,17 +90,17 @@ fn bench_hash(c: &mut Criterion) { // Calculate total bytes processed for throughput reporting. 
let total_bytes: usize = units.iter().map(|u| u.len()).sum(); - let mut g = c.benchmark_group("hash"); + let mut g = c.benchmark_group("stateless"); g.throughput(Throughput::Bytes(total_bytes as u64)); - perform_hashing_benchmarks(&mut g, &units); + stateless_benchmarks(&mut g, &units); g.finish(); } -fn perform_hashing_benchmarks( +fn stateless_benchmarks( group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, units: &[&str], ) { - // Benchmark: StringZilla bytesum + // Benchmark: StringZilla `bytesum` group.bench_function("stringzilla::bytesum", |b| { b.iter(|| { for unit in units { @@ -110,7 +110,7 @@ fn perform_hashing_benchmarks( }) }); - // Benchmark: StringZilla hash + // Benchmark: StringZilla `hash` group.bench_function("stringzilla::hash", |b| { b.iter(|| { for unit in units { @@ -119,8 +119,8 @@ fn perform_hashing_benchmarks( }) }); - // Benchmark: std::hash::BuildHasher (SipHash) - group.bench_function("std::hash::BuildHasher (SipHash)", |b| { + // Benchmark: SipHash via `std::hash::BuildHasher` + group.bench_function("std::hash::BuildHasher", |b| { let std_builder = std::collections::hash_map::RandomState::new(); b.iter(|| { for unit in units { @@ -131,8 +131,8 @@ fn perform_hashing_benchmarks( }) }); - // Benchmark: aHash (hash_one) - group.bench_function("aHash (hash_one)", |b| { + // Benchmark: aHash (`hash_one`) + group.bench_function("aHash::hash_one", |b| { let hash_builder = RandomState::with_seed(42); b.iter(|| { for unit in units { @@ -141,8 +141,8 @@ fn perform_hashing_benchmarks( }) }); - // Benchmark: xxHash (xxh3) - group.bench_function("xxh3", |b| { + // Benchmark: xxHash (`xxh3`) + group.bench_function("xxh3::xxh3_64", |b| { b.iter(|| { for unit in units { let _hash = black_box(xxh3_64(unit.as_bytes())); @@ -151,7 +151,7 @@ fn perform_hashing_benchmarks( }); // Benchmark: gxhash - group.bench_function("gxhash", |b| { + group.bench_function("gxhash::gxhash64", |b| { b.iter(|| { for unit in units { let _hash = 
black_box(gxhash::gxhash64(unit.as_bytes(), 42)); From 380ca6d321ab4e6dd12950ac78eca063644535bf Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Mar 2025 23:36:31 +0000 Subject: [PATCH 20/29] Add: Sequence-sorting benchmarks --- Cargo.toml | 6 +++ bench_sequence.rs | 123 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 97 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8e1aa5f..f56e609 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -85,3 +85,9 @@ name = "bench_hash" path = "bench_hash.rs" harness = false required-features = ["bench_hash"] + +[[bench]] +name = "bench_sequence" +path = "bench_sequence.rs" +harness = false +required-features = ["bench_sequence"] diff --git a/bench_sequence.rs b/bench_sequence.rs index 79f45dd..ebdffee 100644 --- a/bench_sequence.rs +++ b/bench_sequence.rs @@ -1,11 +1,11 @@ -//! # StringWa.rs: String Sorting Benchmarks +//! # StringWa.rs: String Sequence Operations Benchmarks //! //! This file benchmarks various libraries for processing string-identifiable collections. //! Including sorting arrays of strings: //! -//! - `sz::sort` from the StringZilla library -//! - The standard library’s `sort_unstable` -//! - Rayon’s parallel sort (`par_sort_unstable`) +//! - StringZilla's `sz::argsort_permutation` +//! - The standard library's `sort_unstable` +//! - Rayon's parallel `par_sort_unstable` //! //! Intersecting string collections, similar to "STRICT INNER JOIN" in SQL databases. //! @@ -21,19 +21,48 @@ //! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: //! //! ```sh -//! STRINGWARS_TOKENS=lines STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_sort bench_sort --jobs 8 -//! STRINGWARS_TOKENS=words STRINGWARS_DATASET=README.md RUSTFLAGS="-C target-cpu=native" cargo criterion --features bench_sort bench_sort --jobs 8 +//! 
RUSTFLAGS="-C target-cpu=native" \ +//! RAYON_NUM_THREADS=1 \ +//! STRINGWARS_DATASET=README.md \ +//! STRINGWARS_TOKENS=lines \ +//! cargo criterion --features bench_sequence bench_sequence --jobs 8 //! ``` use std::env; use std::fs; +use std::sync::Arc; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; + +use arrow::array::{ArrayRef, StringArray, UInt32Array}; +use arrow::compute::{lexsort_to_indices, SortColumn, SortOptions}; +use arrow::error::Result; use rayon::prelude::*; +use stringzilla::sz::{ + argsort_permutation as sz_argsort_permutation, + argsort_permutation_by as sz_argsort_permutation_by, +}; + +use stringzilla::sz::{ + // Pull some metadata logging functionality + capabilities as sz_capabilities, + dynamic_dispatch as sz_dynamic_dispatch, + version as sz_version, +}; + +fn log_stringzilla_metadata() { + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); +} -// Import the specialized sort from StringZilla. It is assumed that `sz::sort` -// sorts a mutable slice of `String` in place. -use stringzilla::sz::sort as sz_sort; +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(10) // Each loop processes the whole dataset. + .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(10)) // Actual measurement time. +} fn load_data() -> Vec { let dataset_path = @@ -55,48 +84,78 @@ fn load_data() -> Vec { data } -fn bench_sort(c: &mut Criterion) { +fn bench_argsort(c: &mut Criterion) { // Load the dataset once; each benchmark iteration will clone this unsorted data. 
let unsorted = load_data(); - if unsorted.is_empty() { panic!("No data found in dataset for sorting benchmark."); } let mut group = c.benchmark_group("sorting"); + //? We have a very long benchmark, flat sampling is what we need. + //? https://bheisler.github.io/criterion.rs/book/user_guide/advanced_configuration.html#sampling-mode + group.sampling_mode(SamplingMode::Flat); + //? For comparison-based sorting algorithms, we can report throughput in terms of comparisons, + //? which is proportional to the number of elements in the array multiplied by the logarithm of + //? the number of elements. + let throughput = unsorted.len() as f64 * (unsorted.len() as f64).log2(); + group.throughput(criterion::Throughput::Elements(throughput as u64)); - // Benchmark: Specialized sort from StringZilla. - group.bench_function("sz::sort", |b| { + // Benchmark: StringZilla's argsort + group.bench_function("sz::argsort_permutation", |b| { b.iter(|| { - // Clone to ensure each sort works on an unsorted vector. - let mut data = unsorted.clone(); - // Perform the specialized sort. - sz_sort(black_box(&mut data)); - black_box(&data); + let mut indices: Vec = (0..unsorted.len()).collect(); + match sz_argsort_permutation(&unsorted, &mut indices) { + Ok(_) => black_box(&indices), + Err(e) => panic!("StringZilla argsort failed: {:?}", e), + } }) }); - // Benchmark: Standard library sort_unstable. 
- group.bench_function("std::sort_unstable", |b| { + // Benchmark: Apache Arrow's `lexsort_to_indices` + // https://arrow.apache.org/rust/arrow/compute/fn.lexsort.html + // https://arrow.apache.org/rust/arrow/compute/fn.lexsort_to_indices.html + let array = Arc::new(StringArray::from(unsorted.clone())) as ArrayRef; + group.bench_function("arrow::lexsort_to_indices", |b| { b.iter(|| { - let mut data = unsorted.clone(); - data.sort_unstable(); - black_box(&data); + let column_to_sort = SortColumn { + values: array.clone(), + options: Some(SortOptions { + descending: false, + nulls_first: true, + }), + }; + match lexsort_to_indices(&[column_to_sort], None) { + Ok(indices) => black_box(&indices), + Err(e) => panic!("Arrow lexsort failed: {:?}", e), + } }) }); - // Benchmark: Rayon parallel sort_unstable. - group.bench_function("rayon::par_sort_unstable", |b| { + // Benchmark: Standard library argsort using `sort_unstable_by_key` + group.bench_function("std::sort_unstable_by_key", |b| { b.iter(|| { - let mut data = unsorted.clone(); - // Parallel sort requires the `rayon` crate and the ParallelSliceMut trait. 
- data.par_sort_unstable(); - black_box(&data); + let mut indices: Vec = (0..unsorted.len()).collect(); + indices.sort_unstable_by_key(|&i| &unsorted[i]); + black_box(&indices); + }) + }); + + // Benchmark: Parallel argsort using Rayon + group.bench_function("rayon::par_sort_unstable_by_key", |b| { + b.iter(|| { + let mut indices: Vec = (0..unsorted.len()).collect(); + indices.par_sort_unstable_by_key(|&i| &unsorted[i]); + black_box(&indices); }) }); group.finish(); } -criterion_group!(benches, bench_sort); -criterion_main!(benches); +fn main() { + log_stringzilla_metadata(); + let mut criterion = configure_bench(); + bench_argsort(&mut criterion); + criterion.final_summary(); +} From a43ad4372350205a75ffca2e2d209426f589b273 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Mar 2025 11:36:26 +0000 Subject: [PATCH 21/29] Add: Incremental hashing benchmarks --- bench_hash.rs | 92 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/bench_hash.rs b/bench_hash.rs index 03093ad..083838f 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -36,33 +36,20 @@ use std::env; use std::fs; -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use criterion::{black_box, Criterion, Throughput}; -use ahash::RandomState; +use ahash::{AHasher, RandomState}; use blake3; use gxhash; use std::hash::{BuildHasher, Hasher}; -use stringzilla::sz::{bytesum as sz_bytesum, hash as sz_hash}; +use stringzilla::sz; use xxhash_rust::xxh3::xxh3_64; -use stringzilla::sz::{ - // Pull some metadata logging functionality - capabilities as sz_capabilities, - dynamic_dispatch as sz_dynamic_dispatch, - version as sz_version, -}; - fn log_stringzilla_metadata() { - let v = sz_version(); + let v = sz::version(); println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); - println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); - println!("- 
capabilities: {}", sz_capabilities().as_str()); -} - -fn configure_bench() -> Criterion { - Criterion::default() - .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. - .measurement_time(std::time::Duration::from_secs(10)) // Actual measurement time. + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); } fn bench_hash(c: &mut Criterion) { @@ -90,6 +77,12 @@ fn bench_hash(c: &mut Criterion) { // Calculate total bytes processed for throughput reporting. let total_bytes: usize = units.iter().map(|u| u.len()).sum(); + + let mut g = c.benchmark_group("stateful"); + g.throughput(Throughput::Bytes(total_bytes as u64)); + stateful_benchmarks(&mut g, &units); + g.finish(); + let mut g = c.benchmark_group("stateless"); g.throughput(Throughput::Bytes(total_bytes as u64)); stateless_benchmarks(&mut g, &units); @@ -105,7 +98,7 @@ fn stateless_benchmarks( b.iter(|| { for unit in units { // Using black_box to prevent compiler optimizations. 
- let _hash = sz_bytesum(black_box(unit.as_bytes())); + let _hash = sz::bytesum(black_box(unit.as_bytes())); } }) }); @@ -114,7 +107,7 @@ fn stateless_benchmarks( group.bench_function("stringzilla::hash", |b| { b.iter(|| { for unit in units { - let _hash = sz_hash(black_box(unit.as_bytes())); + let _hash = sz::hash(black_box(unit.as_bytes())); } }) }); @@ -169,9 +162,64 @@ fn stateless_benchmarks( }); } +fn stateful_benchmarks( + group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + units: &[&str], +) { + // Benchmark: StringZilla `bytesum` + group.bench_function("stringzilla::bytesum", |b| { + b.iter(|| { + let mut aggregate = 0u64; + for unit in units { + aggregate += sz::bytesum(unit.as_bytes()); + } + black_box(aggregate); + }) + }); + + // Benchmark: StringZilla `hash` + group.bench_function("stringzilla::HashState", |b| { + b.iter(|| { + let mut aggregate = sz::HashState::new(0); + for unit in units { + aggregate.stream(unit.as_bytes()); + } + black_box(aggregate.fold()); + }) + }); + + // Benchmark: SipHash via `std::hash::BuildHasher` + group.bench_function("std::hash::BuildHasher", |b| { + let std_builder = std::collections::hash_map::RandomState::new(); + b.iter(|| { + let mut aggregate = std_builder.build_hasher(); + for unit in units { + aggregate.write(unit.as_bytes()); + } + black_box(aggregate.finish()); + }) + }); + + // Benchmark: aHash (`hash_one`) + group.bench_function("aHash::AHasher", |b| { + b.iter(|| { + let mut aggregate = AHasher::default(); + for unit in units { + aggregate.write(unit.as_bytes()); + } + black_box(aggregate.finish()); + }) + }); +} + fn main() { log_stringzilla_metadata(); - let mut criterion = Criterion::default().configure_from_args(); + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(10) // Number of samples to collect. + .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. 
+ .measurement_time(std::time::Duration::from_secs(10)); // Actual measurement time. + bench_hash(&mut criterion); criterion.final_summary(); } From 849fd99cbdc15f3cbfbbca85dd5ffc594f1ac4a5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Mar 2025 11:46:13 +0000 Subject: [PATCH 22/29] Docs: New Intel Sapphire Rapids results --- .vscode/settings.json | 12 +++ README.md | 175 +++++++++++++++++++++++++----------------- 2 files changed, 118 insertions(+), 69 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 38ece84..e27dc57 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,14 +2,26 @@ "cSpell.words": [ "ahash", "Bioinformatics", + "bstr", + "byteset", + "bytesets", "bytesum", + "corasick", + "Dataframe", + "gxhash", + "lexsort", + "Melem", "memchr", "memmem", + "MergeSort", + "QuickSort", "Needleman", "rapidfuzz", "rfind", + "Skylake", "stringwars", "stringzilla", + "strstr", "tfidf", "Wunsch" ] diff --git a/README.md b/README.md index 9ab03aa..67dbcd3 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,10 @@ _Not to pick a fight, but let there be String Wars!_ 😅 Jokes aside, many __great__ libraries for string processing exist. -_Mostly, of course, written in C and C++, but some in Rust as well._ 😅 +_Mostly, of course, written in Assembly, C, and C++, but some in Rust as well._ 😅 -Where Rust decimates C and C++, however, is the __simplicity__ of dependency management, making it great for benchmarking low-level software! -So, to accelerate the development of the [`stringzilla`](https://github.com/ashvardanian/StringZilla) C library, I've created this repository to compare it against: +Where Rust decimates C and C++, however, is the __simplicity__ of dependency management, making it great for benchmarking "Systems Software"! 
+So, to accelerate the development of the [`StringZilla`](https://github.com/ashvardanian/StringZilla) C library, I've created this repository to compare it against some of my and the community's most beloved Rust projects, like: - [`memchr`](https://github.com/BurntSushi/memchr) for substring search. - [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances. @@ -17,6 +17,7 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv Of course, the functionality of the projects is different, as are the APIs and the usage patterns. So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations. +Notably, I also favor modern hardware with support for a wider range of SIMD instructions, like mask-equipped AVX-512 on x86 starting from the 2017 Intel Skylake-X CPUs or more recent predicated variable-length SVE and SVE2 on Arm, that aren't supported by most of the existing libraries and Rust tooling. ## String Hashing Benchmarks @@ -26,96 +27,132 @@ Many of them have similar pitfalls: - They are not always documented to have a certain reproducible output and are recommended for use only for local in-memory construction of hash tables, not for serialization or network communication. - They don't always support streaming and require the whole input to be available in memory at once. -- They rarely benefit from predicated SIMD instructions on modern hardware like AVX-512 on x86 or SVE on Arm. - They don't always pass the SMHasher test suite, especially with `--extra` checks enabled. +- They generally don't have a dynamic dispatch mechanism to simplify shipping of precompiled software to a wide range of users. StringZilla addresses those issues and seems to provide competitive performance. 
On Intel Sapphire Rapids CPU, on `xlsum.csv` dataset, the following numbers can be expected for hashing individual whitespace-delimited words and newline-delimited lines: -| Benchmark | Lines | Words | -| ---------------------- | ----------: | ---------: | -| `std::hash` (SipHash) | 3.74 GiB/s | 0.43 GiB/s | -| `stringzilla::bytesum` | 11.65 GiB/s | 2.16 GiB/s | -| `stringzilla::hash` | 11.23 GiB/s | 1.84 GiB/s | -| `aHash::hash_one` | 8.61 GiB/s | 1.23 GiB/s | -| `xxh3` | 9.48 GiB/s | 1.08 GiB/s | -| `blake3` | 1.97 GiB/s | N/A GiB/s | -| `gxhash` | 10.81 GiB/s | N/A GiB/s | +| Library | Shorter Words | Longer Lines | +| ---------------------- | -------------: | --------------: | +| `std::hash` | 0.43 GiB/s | 3.74 GiB/s | +| `xxh3::xxh3_64` | 1.08 GiB/s | 9.48 GiB/s | +| `aHash::hash_one` | 1.23 GiB/s | 8.61 GiB/s | +| `gxhash::gxhash64` | __2.68 GiB/s__ | 10.81 GiB/s | +| `stringzilla::hash` | 1.84 GiB/s | __11.23 GiB/s__ | +| | | | +| `blake3::hash` | 0.10 GiB/s | 1.97 GiB/s | +| `stringzilla::bytesum` | 2.16 GiB/s | 11.65 GiB/s | +> Blake3 and byte-level summation are provided as a reference for expected lower and upper bounds. +> Blake3 is a cryptographic hash function and is obliged to provide a certain level of security, which comes at a cost. +> Byte-level summation is a simple operation, that is still sometimes used in practice, and is expected to be the fastest. -## Substring Search Benchmarks +In larger systems, however, we often need the ability to incrementally hash the data. +This is especially important in distributed systems, where the data is too large to fit into memory at once. 
+ +| Library | Shorter Words | Longer Lines | +| -------------------------- | -------------: | -------------: | +| `std::hash::DefaultHasher` | 0.51 GiB/s | 3.92 GiB/s | +| `aHash::AHasher` | __1.30 GiB/s__ | __8.56 GiB/s__ | +| `stringzilla::HashState` | 0.89 GiB/s | 6.39 GiB/s | + +## Substring & Character-Set Search Benchmarks Substring search is one of the most common operations in text processing, and one of the slowest. -StringZilla was designed to supersede LibC and implement those core operations in CPU-friendly manner, using branchless operations, SWAR, and SIMD assembly instructions. -Notably, Rust has a `memchr` crate that provides a similar functionality, and it's used in many popular libraries. -This repository provides basic benchmarking scripts for comparing the throughput of [`stringzilla`](https://github.com/ashvardanian/StringZilla) and [`memchr`](https://github.com/BurntSushi/memchr). -For normal order and reverse order search, over ASCII and UTF8 input data, the following numbers can be expected. - -| | ASCII ⏩ | ASCII ⏪ | UTF8 ⏩ | UTF8 ⏪ | -| ------------- | --------------: | --------------: | -------------: | --------------: | -| Intel: | | | | | -| `memchr` | 5.89 GB/s | 1.08 GB/s | 8.73 GB/s | 3.35 GB/s | -| `stringzilla` | __8.37__ GB/s | __8.21__ GB/s | __11.21__ GB/s | __11.20__ GB/s | -| Arm: | | | | | -| `memchr` | 6.38 GB/s | 1.12 GB/s | __13.20__ GB/s | 3.56 GB/s | -| `stringzilla` | __6.56__ GB/s | __5.56__ GB/s | 9.41 GB/s | __8.17__ GB/s | -| | | | | | -| Average | __1.2x__ faster | __6.2x__ faster | - | __2.8x__ faster | - - -> For Intel the benchmark was run on AWS `r7iz` instances with Sapphire Rapids cores. -> For Arm the benchmark was run on AWS `r7g` instances with Graviton 3 cores. -> The ⏩ signifies forward search, and ⏪ signifies reverse order search. -> At the time of writing, the latest versions of `memchr` and `stringzilla` were used - 2.7.1 and 3.3.0, respectively. 
+Most of the time, programmers don't think about replacing the `str::find` method, as it's already expected to be optimized. +In many languages it's offloaded to the C standard library [`memmem`](https://man7.org/linux/man-pages/man3/memmem.3.html) or [`strstr`](https://en.cppreference.com/w/c/string/byte/strstr) for NULL-terminated strings. +The C standard library is, however, also implemented by humans, and a better solution can be created. -## Replicating the Results +| Library | Shorter Words | Longer Lines | +| -------------------- | --------------: | --------------: | +| `std::str::find` | 9.48 GiB/s | 10.88 GiB/s | +| `memmem::find` | 9.51 GiB/s | 10.83 GiB/s | +| `stringzilla::find` | __10.45 GiB/s__ | __10.89 GiB/s__ | +| | | | +| `std::str::rfind` | 2.96 GiB/s | 3.65 GiB/s | +| `memmem::rfind` | 2.95 GiB/s | 3.71 GiB/s | +| `stringzilla::rfind` | __9.78 GiB/s__ | __10.43 GiB/s__ | -Before running benchmarks, you can test your Rust environment running: +> Higher-throughput evaluation with `memmem` is possible, if the "matcher" object is reused to iterate through the string instead of constructing a new one for each search. -```bash -cargo install cargo-criterion --locked -``` +Similarly, one can search a string for a set of characters. +StringWa.rs takes a few representative examples of various character sets that appear in real parsing or string validation tasks: -Each benchmark includes a warm-up, to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. -To run them on Linux and MacOS, pass the dataset path as an environment variable: +- tabulation characters, like `\n\r\v\f`; +- HTML and XML markup characters, like `&'\"=[]`; +- numeric characters, like `0123456789`. -- Substring Search: +It's common in such cases, to pre-construct some library-specific filter-object or Finite State Machine (FSM) to search for a set of characters. 
+Once that object is constructed, all of its inclusions in each token (word or line) are counted. +Current numbers should look like this: - ```bash - STRINGWARS_DATASET=README.md cargo criterion --features bench_find bench_find --jobs 8 - ``` | Library | Shorter Words | Longer Lines | +| --------------------------- | -------------: | -------------: | +| `bstr::iter` | 0.26 GiB/s | 0.25 GiB/s | +| `regex::find_iter` | 0.23 GiB/s | 5.22 GiB/s | +| `aho_corasick::find_iter` | 0.41 GiB/s | 0.50 GiB/s | +| `stringzilla::find_byteset` | __1.61 GiB/s__ | __8.17 GiB/s__ | - As part of the benchmark, the input "haystack" file is whitespace-tokenized into an array of strings. - In every benchmark iteration, a new "needle" is taken from that array of tokens. - All inclusions of that token in the haystack are counted, and the throughput is calculated. ## Strings Sorting & Intersections Benchmarks -- Edit Distance: Rust has several Dataframe libraries, DBMS and Search engines that heavily rely on string sorting and intersections. Those operations mostly are implemented using conventional algorithms: - ```bash - STRINGWARS_TOKENS=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 - STRINGWARS_TOKENS=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8 - ``` - Comparison-based Quicksort or Mergesort for sorting. - Hash-based or Tree-based algorithms for intersections. Edit distance benchmarks compute the Levenshtein distance between consecutive pairs of whitespace-delimited words or newline-delimited lines. - They include byte-level and character-level operations and also run for the bounded case - when the maximum allowed distance is predefined. - By default, the maximum allowed distance is set to 15% of the longer string in each pair. 
+Assuming the comparisons can be accelerated with SIMD and so can be the hash functions, StringZilla could already provide a performance boost in such applications, but starting with v4 it also provides specialized algorithms for sorting and intersections. +Those are directly compatible with arbitrary string-comparable collection types with a support of an indexed access to the elements. -- Hashing: +| Library | Shorter Words | Longer Lines | +| ------------------------------------------- | -----------------: | ----------------: | +| `std::sort_unstable_by_key` | 54.35 Melem/s | 57.70 Melem/s | +| `arrow::lexsort_to_indices` | ❌ | ❌ | +| `rayon::par_sort_unstable_by_key` on 1 vCPU | ? | 50.35 Melem/s | +| `stringzilla::argsort_permutation` | __182.88 Melem/s__ | __74.64 Melem/s__ | - ```bash - STRINGWARS_TOKENS=file STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 - STRINGWARS_TOKENS=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 - STRINGWARS_TOKENS=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8 - ``` +## Random Generation & Lookup Tables -- Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf): +Some of the most common operations in data processing are random generation and lookup tables. +That's true not only for strings but for any data type, and StringZilla has been extensively used in Image Processing and Bioinformatics for those purposes. - ```bash - STRINGWARS_DATASET=README.md cargo criterion --features bench_tfidf bench_tfidf --jobs 8 - ``` +## String Edit Distance Benchmarks - The TF-IDF benchmarks compute the term frequency-inverse document frequency for each word in the input file. - The benchmark relies on a hybrid of StringZilla and SimSIMD to achieve the best performance. 
+Edit Distance calculation is a common component of Search Engines, Data Cleaning, and Natural Language Processing, as well as Bioinformatics. +It's a computationally expensive operation, generally implemented using dynamic programming, with a quadratic time complexity upper bound. + +## Replicating the Results + +Before running benchmarks, you can test your Rust environment by running: + +```bash +cargo install cargo-criterion --locked +``` + +Wars always take long, and so do these benchmarks. +Every one of them includes a few seconds of a warm-up phase to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. +Each of them accepts a few environment variables to control the dataset, the tokenization, and the error bounds. +You can log those by printing file-level documentation using `awk` on Linux: + +```bash +awk '/^\/\/!/ { print } !/^\/\/!/ { exit }' bench_find.rs +``` + +Commonly used environment variables are: + +- `STRINGWARS_DATASET` - the path to the textual dataset file. +- `STRINGWARS_TOKENS` - the tokenization mode: `file`, `lines`, or `words`. +- `STRINGWARS_ERROR_BOUND` - the maximum allowed error in the Levenshtein distance. 
+ +Here is an example of a common benchmark run on a Unix-like system: + +```bash +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_hash bench_hash --jobs 8 +``` On Windows using PowerShell you'd need to set the environment variable differently: From 1e246db293609698311e28a7f82085c9276dae4c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Mar 2025 20:40:15 +0000 Subject: [PATCH 23/29] Fix: `arrow::LargeStringArray` to avoid overflow --- bench_sequence.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bench_sequence.rs b/bench_sequence.rs index ebdffee..cb658a6 100644 --- a/bench_sequence.rs +++ b/bench_sequence.rs @@ -32,11 +32,10 @@ use std::env; use std::fs; use std::sync::Arc; -use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; +use criterion::{black_box, Criterion, SamplingMode}; -use arrow::array::{ArrayRef, StringArray, UInt32Array}; +use arrow::array::{ArrayRef, LargeStringArray}; use arrow::compute::{lexsort_to_indices, SortColumn, SortOptions}; -use arrow::error::Result; use rayon::prelude::*; use stringzilla::sz::{ argsort_permutation as sz_argsort_permutation, @@ -105,17 +104,17 @@ fn bench_argsort(c: &mut Criterion) { group.bench_function("sz::argsort_permutation", |b| { b.iter(|| { let mut indices: Vec = (0..unsorted.len()).collect(); - match sz_argsort_permutation(&unsorted, &mut indices) { - Ok(_) => black_box(&indices), - Err(e) => panic!("StringZilla argsort failed: {:?}", e), - } + sz_argsort_permutation(&unsorted, &mut indices).expect("StringZilla argsort failed"); + black_box(indices); }) }); // Benchmark: Apache Arrow's `lexsort_to_indices` // https://arrow.apache.org/rust/arrow/compute/fn.lexsort.html // https://arrow.apache.org/rust/arrow/compute/fn.lexsort_to_indices.html - let array = Arc::new(StringArray::from(unsorted.clone())) as 
ArrayRef; + // ! We can't use the conventional `StringArray` in most of our workloads, as it will + // ! overflow the 32-bit tape offset capacity and panic. + let array = Arc::new(LargeStringArray::from(unsorted.clone())) as ArrayRef; group.bench_function("arrow::lexsort_to_indices", |b| { b.iter(|| { let column_to_sort = SortColumn { @@ -126,7 +125,7 @@ fn bench_argsort(c: &mut Criterion) { }), }; match lexsort_to_indices(&[column_to_sort], None) { - Ok(indices) => black_box(&indices), + Ok(indices) => black_box(indices), Err(e) => panic!("Arrow lexsort failed: {:?}", e), } }) From 8aff49f7cb8c5b2cd298a6f0d5e57be302501217 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Mar 2025 09:19:00 +0000 Subject: [PATCH 24/29] Docs: Formatting --- README.md | 6 ++-- bench_find.rs | 66 ++++++++++++++++++++-------------------- bench_hash.rs | 74 +++++++++++++++++++++++---------------------- bench_sequence.rs | 59 ++++++++++++++++++------------------ bench_similarity.rs | 60 ++++++++++++++++++------------------ 5 files changed, 135 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index 67dbcd3..9600fb5 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,9 @@ Those are directly compatible with arbitrary string-comparable collection types | Library | Shorter Words | Longer Lines | | ------------------------------------------- | -----------------: | ----------------: | | `std::sort_unstable_by_key` | 54.35 Melem/s | 57.70 Melem/s | -| `arrow::lexsort_to_indices` | ❌ | ❌ | -| `rayon::par_sort_unstable_by_key` on 1 vCPU | ? 
| 50.35 Melem/s | -| `stringzilla::argsort_permutation` | __182.88 Melem/s__ | __74.64 Melem/s__ | +| `rayon::par_sort_unstable_by_key` on 1 vCPU | 47.08 Melem/s | 50.35 Melem/s | +| `arrow::lexsort_to_indices` | 122.20 Melem/s | __84.73 Melem/s__ | +| `stringzilla::argsort_permutation` | __182.88 Melem/s__ | 74.64 Melem/s | ## Random Generation & Lookup Tables diff --git a/bench_find.rs b/bench_find.rs index 7bfd7cd..28e3c4f 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -1,35 +1,37 @@ -//! # StringWa.rs: Substring Search Benchmarks -//! -//! This file benchmarks the forward and backward exact substring search functionality provided by -//! the StringZilla library and the memchr crate. The input file is treated as a haystack and all -//! of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for -//! any sampled token - all of its inclusions in a string are located. -//! -//! The input file is treated as a haystack and all of its tokens as needles. For substring searches, -//! each occurrence is located. For byteset searches, three separate operations are performed per token, -//! looking for: -//! -//! - any of "\n\r\v\f" - the 4 tabulation characters -//! - any of "&'\"=[]" - the 9 HTML-related characters -//! - any of "0123456789" - the 10 numeric characters -//! -//! ## Usage Examples -//! -//! The benchmarks use two environment variables to control the input dataset and mode: -//! -//! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: -//! - `lines`: Process the dataset line by line. -//! - `words`: Process the dataset word by word. -//! -//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: -//! -//! ```sh -//! RUSTFLAGS="-C target-cpu=native" \ -//! STRINGWARS_DATASET=README.md \ -//! STRINGWARS_TOKENS=lines \ -//! cargo criterion --features bench_find bench_find --jobs 8 -//! 
``` +#![doc = r#" +# StringWa.rs: Substring Search Benchmarks + +This file benchmarks the forward and backward exact substring search functionality provided by +the StringZilla library and the memchr crate. The input file is treated as a haystack and all +of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for +any sampled token - all of its inclusions in a string are located. + +The input file is treated as a haystack and all of its tokens as needles. For substring searches, +each occurrence is located. For byteset searches, three separate operations are performed per token, +looking for: + +- any of "\n\r\v\f" - the 4 tabulation characters +- any of "&'\"=[]" - the 9 HTML-related characters +- any of "0123456789" - the 10 numeric characters + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_find bench_find --jobs 8 +``` +"#] use std::env; use std::fs; use std::time::Duration; diff --git a/bench_hash.rs b/bench_hash.rs index 083838f..4efb298 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -1,38 +1,40 @@ -//! # StringWa.rs: String Hashing Benchmarks -//! -//! This file contains benchmarks for various Rust hashing libraries using Criterion. -//! -//! The benchmarks compare the performance of different hash functions including: -//! -//! - Standard `Hash` implementation -//! - StringZilla (`bytesum`, `hash`, and incremental `hash` function variants) -//! 
- aHash (both incremental and single-entry variants) -//! - xxHash (xxh3) through the third-party `xxhash-rust` crate -//! - gxhash (gxhash64) -//! - Blake3 (the only cryptographic hash in the comparison, for reference) -//! -//! ## Usage Examples -//! -//! The benchmarks use two environment variables to control the input dataset and mode: -//! -//! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: -//! - `lines`: Process the dataset line by line. -//! - `words`: Process the dataset word by word. -//! - `file`: Process the entire file as a single unit. -//! -//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: -//! -//! ```sh -//! RUSTFLAGS="-C target-cpu=native" \ -//! STRINGWARS_DATASET=README.md \ -//! STRINGWARS_TOKENS=lines \ -//! cargo criterion --features bench_hash bench_hash --jobs 8 -//! ``` -//! -//! ## Notes -//! -//! - Ensure your CPU supports the required AES and SSE2 instructions when using `gxhash`. +#![doc = r#" +# StringWa.rs: String Hashing Benchmarks + +This file contains benchmarks for various Rust hashing libraries using Criterion. + +The benchmarks compare the performance of different hash functions including: + +- Standard `Hash` implementation +- StringZilla (`bytesum`, `hash`, and incremental `hash` function variants) +- aHash (both incremental and single-entry variants) +- xxHash (xxh3) through the third-party `xxhash-rust` crate +- gxhash (gxhash64) +- Blake3 (the only cryptographic hash in the comparison, for reference) + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. 
+ - `file`: Process the entire file as a single unit. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_hash bench_hash --jobs 8 +``` + +## Notes + +- Ensure your CPU supports the required AES and SSE2 instructions when using `gxhash`. +"#] use std::env; use std::fs; @@ -216,7 +218,7 @@ fn main() { log_stringzilla_metadata(); let mut criterion = Criterion::default() .configure_from_args() - .sample_size(10) // Number of samples to collect. + .sample_size(30) // Number of samples to collect. .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. .measurement_time(std::time::Duration::from_secs(10)); // Actual measurement time. diff --git a/bench_sequence.rs b/bench_sequence.rs index cb658a6..9f1888b 100644 --- a/bench_sequence.rs +++ b/bench_sequence.rs @@ -1,32 +1,33 @@ -//! # StringWa.rs: String Sequence Operations Benchmarks -//! -//! This file benchmarks various libraries for processing string-identifiable collections. -//! Including sorting arrays of strings: -//! -//! - StringZilla's `sz::argsort_permutation` -//! - The standard library's `sort_unstable` -//! - Rayon's parallel `par_sort_unstable` -//! -//! Intersecting string collections, similar to "STRICT INNER JOIN" in SQL databases. -//! -//! ## Usage Example -//! -//! The benchmarks use two environment variables to control the input dataset and mode: -//! -//! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: -//! - `lines`: Process the dataset line by line. -//! - `words`: Process the dataset word by word. -//! -//! To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: -//! -//! ```sh -//! RUSTFLAGS="-C target-cpu=native" \ -//! RAYON_NUM_THREADS=1 \ -//! 
STRINGWARS_DATASET=README.md \ -//! STRINGWARS_TOKENS=lines \ -//! cargo criterion --features bench_sequence bench_sequence --jobs 8 -//! ``` +#![doc = r#"# StringWa.rs: String Sequence Operations Benchmarks + +This file benchmarks various libraries for processing string-identifiable collections. +Including sorting arrays of strings: + +- StringZilla's `sz::argsort_permutation` +- The standard library's `sort_unstable` +- Rayon's parallel `par_sort_unstable` + +Intersecting string collections, similar to "STRICT INNER JOIN" in SQL databases. + +## Usage Example + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + RAYON_NUM_THREADS=1 \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_sequence bench_sequence --jobs 8 +``` +"#] use std::env; use std::fs; diff --git a/bench_similarity.rs b/bench_similarity.rs index 8a6c691..ba58149 100644 --- a/bench_similarity.rs +++ b/bench_similarity.rs @@ -1,36 +1,36 @@ -//! # StringWa.rs: String Similarity Benchmarks -//! -//! This file benchmarks different libraries implementing string alignment and edit -//! distance calculation, for both generic Levenshtein distances and the weighted -//! Needleman-Wunsch alignment scores used in Bioinformatics. -//! -//! The input file is tokenized into lines or words and each consecutive pair of tokens -//! is evaluated for similarity. As most algorithms have quadratic complexity and use -//! Dynamic Programming techniques, their throughput is evaluate in the number of CUPS, -//! or Cell Updates Per Second. -//! -//! 
## Usage Examples -//! -//! The benchmarks use two environment variables to control the input dataset and mode: -//! -//! - `STRINGWARS_DATASET`: Path to the input dataset file. -//! - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: -//! - `lines`: Process the dataset line by line. -//! - `words`: Process the dataset word by word. -//! - `STRINGWARS_ERROR_BOUND`: Maximum error bound, defined as an integer percent. -//! -//! ```sh -//! RUSTFLAGS="-C target-cpu=native" \ -//! STRINGWARS_DATASET=README.md \ -//! STRINGWARS_ERROR_BOUND=15 \ -//! STRINGWARS_TOKENS=lines \ -//! cargo criterion --features bench_similarity bench_similarity --jobs 8 -//! ``` -//! +#![doc = r#"# StringWa.rs: String Similarity Benchmarks + +This file benchmarks different libraries implementing string alignment and edit +distance calculation, for both generic Levenshtein distances and the weighted +Needleman-Wunsch alignment scores used in Bioinformatics. + +The input file is tokenized into lines or words and each consecutive pair of tokens +is evaluated for similarity. As most algorithms have quadratic complexity and use +Dynamic Programming techniques, their throughput is evaluated in the number of CUPS, +or Cell Updates Per Second. + +## Usage Examples + +The benchmarks use three environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. +- `STRINGWARS_ERROR_BOUND`: Maximum error bound, defined as an integer percent. 
+ +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_ERROR_BOUND=15 \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_similarity bench_similarity --jobs 8 +``` +"#] use std::env; use std::fs; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::Criterion; use rapidfuzz::distance::levenshtein; use stringzilla::sz::{ From 05b56bf694cb91091432d59dee07d87df9185452 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 24 Mar 2025 15:17:15 +0000 Subject: [PATCH 25/29] Improve: Switch to binary strings --- bench_find.rs | 189 +++++++++++++++++++++++--------------------------- 1 file changed, 88 insertions(+), 101 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 28e3c4f..bfd8894 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -1,10 +1,11 @@ #![doc = r#" -# StringWa.rs: Substring Search Benchmarks +# StringWa.rs: Substring & Character-Set Search Benchmarks This file benchmarks the forward and backward exact substring search functionality provided by the StringZilla library and the memchr crate. The input file is treated as a haystack and all of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for any sampled token - all of its inclusions in a string are located. +Be warned, for large files, it may take a while! The input file is treated as a haystack and all of its tokens as needles. For substring searches, each occurrence is located. 
For byteset searches, three separate operations are performed per token, @@ -33,6 +34,7 @@ RUSTFLAGS="-C target-cpu=native" \ ``` "#] use std::env; +use std::error::Error; use std::fs; use std::time::Duration; @@ -42,91 +44,51 @@ use aho_corasick::AhoCorasick; use bstr::ByteSlice; use memchr::memmem; use regex::bytes::Regex; -use stringzilla::sz::{ - find as sz_find, - find_byteset as sz_find_byteset, // - rfind as sz_rfind, - Byteset, -}; - -use stringzilla::sz::{ - // Pull some metadata logging functionality - capabilities as sz_capabilities, - dynamic_dispatch as sz_dynamic_dispatch, - version as sz_version, -}; - -fn log_stringzilla_metadata() { - let v = sz_version(); - println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); - println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); - println!("- capabilities: {}", sz_capabilities().as_str()); -} - -fn configure_bench() -> Criterion { - Criterion::default() - .sample_size(10) // Each loop scans the whole dataset. - .warm_up_time(Duration::from_secs(10)) // Let the CPU frequencies settle. - .measurement_time(Duration::from_secs(120)) // Actual measurement time. +use stringzilla::sz; + +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. +pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) } -fn bench_find(c: &mut Criterion) { - // Get the haystack path from the environment variable. - let dataset_path = - env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". 
+pub fn tokenize<'a>(haystack: &'a [u8]) -> Result, Box> { let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); - let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack"); - - // Tokenize the haystack content by white space or lines. - let needles: Vec<&str> = match mode.as_str() { - "lines" => haystack_content.lines().collect(), - "words" => haystack_content.split_whitespace().collect(), - other => panic!( - "Unknown STRINGWARS_TOKENS: {}. Use 'lines' or 'words'.", - other - ), + let tokens = match mode.as_str() { + "lines" => haystack.split(|&c| c == b'\n').collect(), + "words" => haystack.split(|&c| c == b'\n' || c == b' ').collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", + other + ) + .into()) + } }; - - if needles.is_empty() { - panic!("No tokens found in the haystack."); - } - - let haystack = haystack_content.as_bytes(); - let haystack_length = haystack.len(); - - // Benchmarks for forward search - let mut g = c.benchmark_group("substring-forward"); - g.throughput(Throughput::Bytes(haystack_length as u64)); - bench_substring_forward(&mut g, &needles, haystack); - g.finish(); - - // Benchmarks for backward search - let mut g = c.benchmark_group("substring-backward"); - g.throughput(Throughput::Bytes(haystack_length as u64)); - bench_substring_backward(&mut g, &needles, haystack); - g.finish(); - - // Benchmarks for byteset search - let mut g = c.benchmark_group("byteset-forward"); - g.throughput(Throughput::Bytes(3 * haystack_length as u64)); - bench_byteset_forward(&mut g, &needles); - g.finish(); + Ok(tokens) } +/// Benchmarks forward substring search using "StringZilla", "MemMem", and standard strings. 
fn bench_substring_forward( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - needles: &[&str], haystack: &[u8], + needles: &[&[u8]], ) { + g.throughput(Throughput::Bytes(haystack.len() as u64)); + // Benchmark for StringZilla forward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("sz::find", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; - while let Some(found) = sz_find(&haystack[pos..], token_bytes) { - pos += found + token_bytes.len(); + while let Some(found) = sz::find(&haystack[pos..], token) { + pos += found + token.len(); } }) }); @@ -136,10 +98,9 @@ fn bench_substring_forward( g.bench_function("memmem::find", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; - while let Some(found) = memmem::find(&haystack[pos..], token_bytes) { - pos += found + token_bytes.len(); + while let Some(found) = memmem::find(&haystack[pos..], token) { + pos += found + token.len(); } }) }); @@ -157,20 +118,22 @@ fn bench_substring_forward( }); } +/// Benchmarks backward substring search using "StringZilla", "MemMem", and standard strings. fn bench_substring_backward( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - needles: &[&str], haystack: &[u8], + needles: &[&[u8]], ) { + g.throughput(Throughput::Bytes(haystack.len() as u64)); + // Benchmark for StringZilla backward search using a cycle iterator. 
let mut tokens = needles.iter().cycle(); g.bench_function("sz::rfind", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { - if let Some(found) = sz_rfind(&haystack[..end], token_bytes) { + if let Some(found) = sz::rfind(&haystack[..end], token) { pos = Some(found); } else { break; @@ -184,10 +147,9 @@ fn bench_substring_backward( g.bench_function("memmem::rfind", |b| { b.iter(|| { let token = black_box(*tokens.next().unwrap()); - let token_bytes = black_box(token.as_bytes()); let mut pos: Option = Some(haystack.len()); while let Some(end) = pos { - if let Some(found) = memmem::rfind(&haystack[..end], token_bytes) { + if let Some(found) = memmem::rfind(&haystack[..end], token) { pos = Some(found); } else { break; @@ -213,33 +175,36 @@ fn bench_substring_backward( }); } +/// Benchmarks byteset search using "StringZilla", "bstr", "RegEx", and "AhoCorasick" fn bench_byteset_forward( g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - needles: &[&str], + haystack: &[u8], + needles: &[&[u8]], ) { + g.throughput(Throughput::Bytes(3 * haystack.len() as u64)); + // Define the three bytesets we will analyze. const BYTES_TABS: &[u8] = b"\n\r\x0B\x0C"; const BYTES_HTML: &[u8] = b"&'\"=[]"; const BYTES_DIGITS: &[u8] = b"0123456789"; // Benchmark for StringZilla forward search using a cycle iterator. 
- let sz_tabs = Byteset::from(BYTES_TABS); - let sz_html = Byteset::from(BYTES_HTML); - let sz_digits = Byteset::from(BYTES_DIGITS); + let sz_tabs = sz::Byteset::from(BYTES_TABS); + let sz_html = sz::Byteset::from(BYTES_HTML); + let sz_digits = sz::Byteset::from(BYTES_DIGITS); g.bench_function("sz::find_byteset", |b| { b.iter(|| { for token in needles.iter() { - let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_tabs) { + while let Some(found) = sz::find_byteset(&token[pos..], sz_tabs) { pos += found + 1; } pos = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_html) { + while let Some(found) = sz::find_byteset(&token[pos..], sz_html) { pos += found + 1; } pos = 0; - while let Some(found) = sz_find_byteset(&token_bytes[pos..], sz_digits) { + while let Some(found) = sz::find_byteset(&token[pos..], sz_digits) { pos += found + 1; } } @@ -250,28 +215,19 @@ fn bench_byteset_forward( g.bench_function("bstr::iter", |b| { b.iter(|| { for token in needles.iter() { - let token_bytes = black_box(token.as_bytes()); let mut pos: usize = 0; // Inline search for `BYTES_TABS`. - while let Some(found) = token_bytes[pos..] - .iter() - .position(|&c| BYTES_TABS.contains(&c)) - { + while let Some(found) = token[pos..].iter().position(|&c| BYTES_TABS.contains(&c)) { pos += found + 1; } pos = 0; // Inline search for `BYTES_HTML`. - while let Some(found) = token_bytes[pos..] - .iter() - .position(|&c| BYTES_HTML.contains(&c)) - { + while let Some(found) = token[pos..].iter().position(|&c| BYTES_HTML.contains(&c)) { pos += found + 1; } pos = 0; // Inline search for `BYTES_DIGITS`. - while let Some(found) = token_bytes[pos..] 
- .iter() - .position(|&c| BYTES_DIGITS.contains(&c)) + while let Some(found) = token[pos..].iter().position(|&c| BYTES_DIGITS.contains(&c)) { pos += found + 1; } @@ -327,8 +283,39 @@ fn bench_byteset_forward( } fn main() { - log_stringzilla_metadata(); - let mut criterion = configure_bench(); - bench_find(&mut criterion); + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let haystack = load_dataset().unwrap(); + let needles = tokenize(&haystack).unwrap(); + if needles.is_empty() { + panic!("No tokens found in the dataset."); + } + + // Setup the default durations + let mut criterion = Criterion::default() + .sample_size(10) // Each loop scans the whole dataset, but this can't be under 10 + .warm_up_time(Duration::from_secs(1)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(10)); // Actual measurement time. 
+ + // Benchmarks for forward search + let mut group = criterion.benchmark_group("substring-forward"); + bench_substring_forward(&mut group, &haystack, &needles); + group.finish(); + + // Benchmarks for backward search + let mut group = criterion.benchmark_group("substring-backward"); + bench_substring_backward(&mut group, &haystack, &needles); + group.finish(); + + // Benchmarks for byteset search + let mut group = criterion.benchmark_group("byteset-forward"); + bench_byteset_forward(&mut group, &haystack, &needles); + group.finish(); + criterion.final_summary(); } From b42ae25417f2e41bab0e251eed693c536758dc89 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 24 Mar 2025 19:07:30 +0000 Subject: [PATCH 26/29] Fix: Skip empty tokens --- bench_find.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index bfd8894..9c62d44 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -59,8 +59,14 @@ pub fn load_dataset() -> Result, Box> { pub fn tokenize<'a>(haystack: &'a [u8]) -> Result, Box> { let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); let tokens = match mode.as_str() { - "lines" => haystack.split(|&c| c == b'\n').collect(), - "words" => haystack.split(|&c| c == b'\n' || c == b' ').collect(), + "lines" => haystack + .split(|&c| c == b'\n') + .filter(|token| !token.is_empty()) + .collect(), + "words" => haystack + .split(|&c| c == b'\n' || c == b' ') + .filter(|token| !token.is_empty()) + .collect(), "file" => vec![haystack], other => { return Err(format!( From 848da95e9f189f46deff6b0411b9214ed604431e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 24 Mar 2025 19:07:48 +0000 Subject: [PATCH 27/29] Add: `memmem` iterators --- bench_find.rs | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/bench_find.rs b/bench_find.rs index 
9c62d44..23f2ce0 100644 --- a/bench_find.rs +++ b/bench_find.rs @@ -99,7 +99,7 @@ fn bench_substring_forward( }) }); - // Benchmark for `memmem` forward search using a cycle iterator. + // Benchmark for `memmem::find` forward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("memmem::find", |b| { b.iter(|| { @@ -111,7 +111,7 @@ fn bench_substring_forward( }) }); - // Benchmark for default `std::str` forward search. + // Benchmark for default `std::str::find` forward search. let mut tokens = needles.iter().cycle(); g.bench_function("std::str::find", |b| { b.iter(|| { @@ -122,6 +122,17 @@ fn bench_substring_forward( } }) }); + + // Benchmark for `memmem::find_iter` forward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::find_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + for match_ in memmem::find_iter(haystack, token) { + black_box(match_); + } + }) + }); } /// Benchmarks backward substring search using "StringZilla", "MemMem", and standard strings. @@ -148,7 +159,7 @@ fn bench_substring_backward( }) }); - // Benchmark for memmem backward search using a cycle iterator. + // Benchmark for `memmem::rfind` backward search using a cycle iterator. let mut tokens = needles.iter().cycle(); g.bench_function("memmem::rfind", |b| { b.iter(|| { @@ -164,7 +175,7 @@ fn bench_substring_backward( }) }); - // Benchmark for default `std::str` backward search. + // Benchmark for default `std::str::rfind` backward search. let mut tokens = needles.iter().cycle(); g.bench_function("std::str::rfind", |b| { b.iter(|| { @@ -179,6 +190,17 @@ fn bench_substring_backward( } }) }); + + // Benchmark for `memmem::rfind_iter` forward search using a cycle iterator. 
+ let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::rfind_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + for match_ in memmem::rfind_iter(haystack, token) { + black_box(match_); + } + }) + }); } /// Benchmarks byteset search using "StringZilla", "bstr", "RegEx", and "AhoCorasick" @@ -305,8 +327,8 @@ fn main() { // Setup the default durations let mut criterion = Criterion::default() .sample_size(10) // Each loop scans the whole dataset, but this can't be under 10 - .warm_up_time(Duration::from_secs(1)) // Let the CPU frequencies settle. - .measurement_time(Duration::from_secs(10)); // Actual measurement time. + .warm_up_time(Duration::from_secs(3)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(20)); // Actual measurement time. // Benchmarks for forward search let mut group = criterion.benchmark_group("substring-forward"); From 1176473971fa71165536c725b597a0f4d19d31b2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 24 Mar 2025 19:08:28 +0000 Subject: [PATCH 28/29] Improve: Style as search benchmarks --- bench_hash.rs | 156 ++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 70 deletions(-) diff --git a/bench_hash.rs b/bench_hash.rs index 4efb298..a5e8a8f 100644 --- a/bench_hash.rs +++ b/bench_hash.rs @@ -1,7 +1,10 @@ #![doc = r#" # StringWa.rs: String Hashing Benchmarks -This file contains benchmarks for various Rust hashing libraries using Criterion. +This file contains benchmarks for various Rust hashing libraries using Criterion, +treating the inputs as binary strings without any UTF-8 validity constrains. +For accurate stats aggregation, on each iteration, the whole file is scanned. +Be warned, for large files, it may take a while! 
The benchmarks compare the performance of different hash functions including: @@ -20,7 +23,7 @@ The benchmarks use two environment variables to control the input dataset and mo - `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: - `lines`: Process the dataset line by line. - `words`: Process the dataset word by word. - - `file`: Process the entire file as a single unit. + - `file`: Process the entire file as a single token. To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: @@ -31,11 +34,10 @@ RUSTFLAGS="-C target-cpu=native" \ cargo criterion --features bench_hash bench_hash --jobs 8 ``` -## Notes - -- Ensure your CPU supports the required AES and SSE2 instructions when using `gxhash`. +For `gxhash`, ensure that your CPU supports the required AES and SSE2 instructions. "#] use std::env; +use std::error::Error; use std::fs; use criterion::{black_box, Criterion, Throughput}; @@ -47,60 +49,48 @@ use std::hash::{BuildHasher, Hasher}; use stringzilla::sz; use xxhash_rust::xxh3::xxh3_64; -fn log_stringzilla_metadata() { - let v = sz::version(); - println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); - println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); - println!("- capabilities: {}", sz::capabilities().as_str()); +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. +pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) } -fn bench_hash(c: &mut Criterion) { - let dataset_path = - env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". 
+pub fn tokenize<'a>(haystack: &'a [u8]) -> Result, Box> { let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); - - let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); - let units: Vec<&str> = match mode.as_str() { - "lines" => content.lines().collect(), - "words" => content.split_whitespace().collect(), - "file" => { - // In "file" mode, treat the entire content as a single unit. - vec![&content] + let tokens = match mode.as_str() { + "lines" => haystack.split(|&c| c == b'\n').collect(), + "words" => haystack.split(|&c| c == b'\n' || c == b' ').collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", + other + ) + .into()) } - other => panic!( - "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", - other - ), }; - - if units.is_empty() { - panic!("No data found for hashing in the provided dataset."); - } - - // Calculate total bytes processed for throughput reporting. 
- let total_bytes: usize = units.iter().map(|u| u.len()).sum(); - - let mut g = c.benchmark_group("stateful"); - g.throughput(Throughput::Bytes(total_bytes as u64)); - stateful_benchmarks(&mut g, &units); - g.finish(); - - let mut g = c.benchmark_group("stateless"); - g.throughput(Throughput::Bytes(total_bytes as u64)); - stateless_benchmarks(&mut g, &units); - g.finish(); + Ok(tokens) } -fn stateless_benchmarks( +/// Benchmarks stateless hashes seeing the whole input at once +fn bench_stateless( group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - units: &[&str], + tokens: &[&[u8]], ) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + group.throughput(Throughput::Bytes(total_bytes as u64)); + // Benchmark: StringZilla `bytesum` group.bench_function("stringzilla::bytesum", |b| { b.iter(|| { - for unit in units { + for token in tokens { // Using black_box to prevent compiler optimizations. 
- let _hash = sz::bytesum(black_box(unit.as_bytes())); + let _hash = sz::bytesum(black_box(token)); } }) }); @@ -108,8 +98,8 @@ fn stateless_benchmarks( // Benchmark: StringZilla `hash` group.bench_function("stringzilla::hash", |b| { b.iter(|| { - for unit in units { - let _hash = sz::hash(black_box(unit.as_bytes())); + for token in tokens { + let _hash = sz::hash(black_box(token)); } }) }); @@ -118,9 +108,9 @@ fn stateless_benchmarks( group.bench_function("std::hash::BuildHasher", |b| { let std_builder = std::collections::hash_map::RandomState::new(); b.iter(|| { - for unit in units { + for token in tokens { let mut hasher = std_builder.build_hasher(); - hasher.write(unit.as_bytes()); + hasher.write(token); let _hash = black_box(hasher.finish()); } }) @@ -130,8 +120,8 @@ fn stateless_benchmarks( group.bench_function("aHash::hash_one", |b| { let hash_builder = RandomState::with_seed(42); b.iter(|| { - for unit in units { - let _hash = black_box(hash_builder.hash_one(unit.as_bytes())); + for token in tokens { + let _hash = black_box(hash_builder.hash_one(token)); } }) }); @@ -139,8 +129,8 @@ fn stateless_benchmarks( // Benchmark: xxHash (`xxh3`) group.bench_function("xxh3::xxh3_64", |b| { b.iter(|| { - for unit in units { - let _hash = black_box(xxh3_64(unit.as_bytes())); + for token in tokens { + let _hash = black_box(xxh3_64(token)); } }) }); @@ -148,8 +138,8 @@ fn stateless_benchmarks( // Benchmark: gxhash group.bench_function("gxhash::gxhash64", |b| { b.iter(|| { - for unit in units { - let _hash = black_box(gxhash::gxhash64(unit.as_bytes(), 42)); + for token in tokens { + let _hash = black_box(gxhash::gxhash64(token, 42)); } }) }); @@ -157,23 +147,28 @@ fn stateless_benchmarks( // Benchmark: Blake3 - should be by far the slowest, as it's a cryptographic hash. 
group.bench_function("blake3", |b| { b.iter(|| { - for unit in units { - let _hash = black_box(blake3::hash(unit.as_bytes())); + for token in tokens { + let _hash = black_box(blake3::hash(token)); } }) }); } -fn stateful_benchmarks( +/// Benchmarks stateful hashes seeing one slice at a time +fn bench_stateful( group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - units: &[&str], + tokens: &[&[u8]], ) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + group.throughput(Throughput::Bytes(total_bytes as u64)); + // Benchmark: StringZilla `bytesum` group.bench_function("stringzilla::bytesum", |b| { b.iter(|| { let mut aggregate = 0u64; - for unit in units { - aggregate += sz::bytesum(unit.as_bytes()); + for token in tokens { + aggregate += sz::bytesum(token); } black_box(aggregate); }) @@ -183,8 +178,8 @@ fn stateful_benchmarks( group.bench_function("stringzilla::HashState", |b| { b.iter(|| { let mut aggregate = sz::HashState::new(0); - for unit in units { - aggregate.stream(unit.as_bytes()); + for token in tokens { + aggregate.stream(token); } black_box(aggregate.fold()); }) @@ -195,8 +190,8 @@ fn stateful_benchmarks( let std_builder = std::collections::hash_map::RandomState::new(); b.iter(|| { let mut aggregate = std_builder.build_hasher(); - for unit in units { - aggregate.write(unit.as_bytes()); + for token in tokens { + aggregate.write(token); } black_box(aggregate.finish()); }) @@ -206,8 +201,8 @@ fn stateful_benchmarks( group.bench_function("aHash::AHasher", |b| { b.iter(|| { let mut aggregate = AHasher::default(); - for unit in units { - aggregate.write(unit.as_bytes()); + for token in tokens { + aggregate.write(token); } black_box(aggregate.finish()); }) @@ -215,13 +210,34 @@ fn stateful_benchmarks( } fn main() { - log_stringzilla_metadata(); + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, 
v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let dataset = load_dataset().unwrap(); + let tokens = tokenize(&dataset).unwrap(); + if tokens.is_empty() { + panic!("No tokens found in the dataset."); + } + let mut criterion = Criterion::default() .configure_from_args() .sample_size(30) // Number of samples to collect. .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. .measurement_time(std::time::Duration::from_secs(10)); // Actual measurement time. - bench_hash(&mut criterion); + // Profile hash functions that see the whole input at once + let mut group = criterion.benchmark_group("stateful"); + bench_stateful(&mut group, &tokens); + group.finish(); + + // Profile incremental hash functions that see only a slice of data at a time + let mut group = criterion.benchmark_group("stateless"); + bench_stateless(&mut group, &tokens); + group.finish(); + criterion.final_summary(); } From 38cefd68e01d5e3ac3cae5b2d0cb1fb444ebaf74 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 24 Mar 2025 19:08:44 +0000 Subject: [PATCH 29/29] Add: Memory-system benchmarks --- Cargo.lock | 401 ++++++++++++++++++++++++++++++++++++++++++++---- Cargo.toml | 73 ++++++++- bench_memory.rs | 201 ++++++++++++++++++++++++ 3 files changed, 642 insertions(+), 33 deletions(-) create mode 100644 bench_memory.rs diff --git a/Cargo.lock b/Cargo.lock index e23cff7..3d84cae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,10 +10,10 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.12", "once_cell", "version_check", - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -333,7 +333,7 @@ dependencies = [ "num-traits", "ordered-float", 
"petgraph", - "rand", + "rand 0.8.5", "regex", "serde", "serde_derive", @@ -379,6 +379,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + [[package]] name = "blake3" version = "1.6.1" @@ -453,6 +459,8 @@ version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -471,7 +479,7 @@ dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", - "windows-targets", + "windows-targets 0.52.0", ] [[package]] @@ -501,6 +509,26 @@ dependencies = [ "half", ] +[[package]] +name = "clang" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c044c781163c001b913cd018fc95a628c50d0d2dfea8bca77dad71edb16e37" +dependencies = [ + "clang-sys", + "libc", +] + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", +] + [[package]] name = "clap" version = "4.5.1" @@ -541,7 +569,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.12", "once_cell", "tiny-keccak", ] @@ -663,6 +691,12 @@ dependencies = [ "syn", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "editdistancek" 
version = "1.0.2" @@ -719,7 +753,7 @@ version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ - "bitflags", + "bitflags 1.3.2", "rustc_version 0.4.1", ] @@ -740,9 +774,27 @@ checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.0", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + [[package]] name = "gxhash" version = "3.4.1" @@ -792,7 +844,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.52.0", ] [[package]] @@ -849,6 +901,15 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.68" @@ -930,9 +991,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.153" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libm" @@ -982,7 +1043,7 @@ dependencies = [ 
"num-complex", "num-rational", "num-traits", - "rand", + "rand 0.8.5", "rand_distr", "simba", "typenum", @@ -1098,6 +1159,41 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "opencv" +version = "0.94.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f80fd7d018d20b1e49bdd65e72350f1f63cad6bc9c15f850c47c31a6ad8d0d20" +dependencies = [ + "cc", + "dunce", + "jobserver", + "libc", + "num-traits", + "once_cell", + "opencv-binding-generator", + "pkg-config", + "semver 1.0.26", + "shlex", + "vcpkg", + "windows", +] + +[[package]] +name = "opencv-binding-generator" +version = "0.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7283829fe440be381fea73521f850b287fd44f994acd6453e1e19b3d479ef7fc" +dependencies = [ + "clang", + "clang-sys", + "dunce", + "once_cell", + "percent-encoding", + "regex", + "shlex", +] + [[package]] name = "ordered-float" version = "5.0.0" @@ -1113,6 +1209,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "petgraph" version = "0.7.1" @@ -1123,6 +1225,12 @@ dependencies = [ "indexmap", ] +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "plotters" version = "0.3.5" @@ -1197,8 +1305,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ 
"libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", + "zerocopy 0.8.24", ] [[package]] @@ -1208,7 +1327,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -1217,7 +1346,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.12", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.1", ] [[package]] @@ -1227,7 +1365,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" dependencies = [ "num-traits", - "rand", + "rand 0.8.5", +] + +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.3", ] [[package]] @@ -1416,7 +1563,7 @@ dependencies = [ "approx", "nalgebra", "num-traits", - "rand", + "rand 0.8.5", ] [[package]] @@ -1430,14 +1577,20 @@ 
dependencies = [ "blake3", "bstr", "criterion", + "getrandom 0.3.1", "gxhash", "memchr", - "rand", + "opencv", + "rand 0.9.0", + "rand_chacha 0.9.0", + "rand_xoshiro", "rapidfuzz", "rayon", "regex", "stringzilla", + "twox-hash", "xxhash-rust", + "zeroize", ] [[package]] @@ -1542,6 +1695,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "typenum" version = "1.18.0" @@ -1554,6 +1716,12 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "vec_map" version = "0.8.2" @@ -1585,6 +1753,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.91" @@ -1690,13 +1867,82 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7f919aee0a93304be7f62e8e5027811bbba96bcb1de84d6618be56e43f8a32a1" +dependencies = [ + "windows-core 0.59.0", + "windows-targets 0.53.0", +] + [[package]] name = "windows-core" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets", + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-core" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "810ce18ed2112484b0d4e15d022e5f598113e220c53e373fb31e67e21670c1ce" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-strings", + "windows-targets 0.53.0", +] + +[[package]] +name = "windows-implement" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83577b051e2f49a058c308f17f273b570a6a758386fc291b5f6a934dd84e48c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26fd936d991781ea39e87c3a27285081e3c0da5ca0fcbc02d368cc6f52ff01" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dccfd733ce2b1753b03b6d3c65edf020262ea35e20ccdf3e288043e6dd620e3" + +[[package]] +name = "windows-result" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06374efe858fab7e4f881500e6e86ec8bc28f9462c47e5a9941a0142ad86b189" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" +dependencies = [ + "windows-link", ] [[package]] @@ -1705,7 +1951,7 
@@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.0", ] [[package]] @@ -1714,13 +1960,29 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1729,42 +1991,99 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +[[package]] +name = "windows_aarch64_msvc" 
+version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.9.0", +] + [[package]] name = "xxhash-rust" version = "0.8.12" @@ -1777,7 +2096,16 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "zerocopy-derive", + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +dependencies = [ + "zerocopy-derive 0.8.24", ] [[package]] @@ -1790,3 +2118,20 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerocopy-derive" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/Cargo.toml b/Cargo.toml index f56e609..6f96f6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,17 +4,46 @@ version = "0.1.0" edition = "2018" [dependencies] -rand = "0.8.5" 
criterion = "0.5.1" # stringzilla = { version = "3.3.0" } stringzilla = { path = "../StringZilla/" } # Feature-based dependencies for benchmarks [features] -bench_find = ["memchr", "bstr", "aho-corasick", "regex"] -bench_hash = ["ahash", "xxhash-rust", "blake3", "gxhash"] -bench_sequence = ["arrow", "rayon"] -bench_similarity = ["rapidfuzz", "bio"] +bench_find = [ + "memchr", # Substring Search + "bstr", # Byteset Search + "aho-corasick", # Byteset Search + "regex", # Byteset Search +] +bench_hash = [ + "ahash", # Hashing + "xxhash-rust", # One-Shot Hashing + "twox-hash", # One-Shot Hashing (same algo as xxHash) + "gxhash", # One-Shot Hashing + "blake3", # Cryptographic Hashing Baseline +] +bench_sequence = [ + "arrow", # Sorting + "rayon", # Parallel Sorting +] +bench_similarity = [ + "rapidfuzz", # Levenshtein Distance (also `strsim`) + "bio", # Needleman-Wunsch Score +] +bench_memory = [ + # "opencv", # Lookup Transform + "rand", # Randomize Buffer + "zeroize", # Obfuscate Buffer + "getrandom", # Randomize Buffer via OS + "rand_chacha", # Randomize Buffer + "rand_xoshiro", # Randomize Buffer +] + +# TODO: String properties: +# - `unicode-width` +# - `textwrap` +# - `unicode-segmentation` [dependencies.memchr] version = "2.7.1" @@ -60,6 +89,10 @@ version = "0.8" optional = true features = ["xxh3", "const_xxh3"] +[dependencies.twox-hash] +version = "2.1.0" +optional = true + [dependencies.arrow] version = "54.2.1" optional = true @@ -68,6 +101,30 @@ optional = true version = "1.10.0" optional = true +[dependencies.opencv] +version = "0.94.2" +optional = true + +[dependencies.zeroize] +version = "1.8.1" +optional = true + +[dependencies.getrandom] +version = "0.3.1" +optional = true + +[dependencies.rand] +version = "0.9.0" +optional = true + +[dependencies.rand_chacha] +version = "0.9.0" +optional = true + +[dependencies.rand_xoshiro] +version = "0.7.0" +optional = true + [[bench]] name = "bench_find" path = "bench_find.rs" @@ -91,3 +148,9 @@ name = 
"bench_sequence" path = "bench_sequence.rs" harness = false required-features = ["bench_sequence"] + +[[bench]] +name = "bench_memory" +path = "bench_memory.rs" +harness = false +required-features = ["bench_memory"] diff --git a/bench_memory.rs b/bench_memory.rs new file mode 100644 index 0000000..b278c8a --- /dev/null +++ b/bench_memory.rs @@ -0,0 +1,201 @@ +#![doc = r#" +# StringWa.rs: Low-level Memory-related Benchmarks + +This file benchmarks low-level memory operations. The input file is treated as a collection +of size-representative tokens and for every token the following operations are benchmarked: + +- case inversion using Lookup Table Transforms (LUT), common in image processing +- memory obfuscation using Pseudo-Random Number Generators (PRNG), common in sensitive apps + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_memory bench_memory --jobs 8 +``` +"#] +use std::env; +use std::error::Error; +use std::fs; +use std::time::Duration; + +use criterion::{black_box, Criterion, Throughput}; + +use getrandom; +use rand; +use rand::{RngCore, SeedableRng}; +use rand_chacha; +use rand_xoshiro; +use stringzilla::sz; +use zeroize::Zeroize; + +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. 
+pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) +} + +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". +pub fn tokenize<'a>(haystack: &'a mut [u8]) -> Result, Box> { + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + let tokens = match mode.as_str() { + "lines" => haystack.split_mut(|&c| c == b'\n').collect(), + "words" => haystack.split_mut(|&c| c == b'\n' || c == b' ').collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", + other + ) + .into()) + } + }; + Ok(tokens) +} + +fn bench_lookup_table( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &mut [&mut [u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + g.throughput(Throughput::Bytes(total_bytes as u64)); + + // Benchmark for StringZilla forward search using a cycle iterator. + let mut lookup_invert_case: [u8; 256] = core::array::from_fn(|i| i as u8); + for (upper, lower) in ('A'..='Z').zip('a'..='z') { + lookup_invert_case[upper as usize] = lower as u8; + } + for (upper, lower) in ('A'..='Z').zip('a'..='z') { + lookup_invert_case[lower as usize] = upper as u8; + } + + // Benchmark using StringZilla's `lookup_inplace`. + g.bench_function("sz::lookup_inplace", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + sz::lookup_inplace(&mut *token, lookup_invert_case); + black_box(token); + } + }) + }); + + // Benchmark a plain serial mapping using the same lookup table. 
+ g.bench_function("serial", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + for byte in token.iter_mut() { + *byte = lookup_invert_case[*byte as usize]; + } + black_box(&token); + } + }) + }); +} + +fn bench_generate_random( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &mut [&mut [u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + g.throughput(Throughput::Bytes(total_bytes as u64)); + + // Benchmark for StringZilla AES-based PRNG + g.bench_function("sz::fill_random", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + sz::fill_random(&mut *token, 0) + } + }) + }); + + // Benchmark using zeroize to obfuscate (zero out) the buffer. + g.bench_function("zeroize::zeroize", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + token.zeroize(); + black_box(&token); + } + }) + }); + + // Benchmark using `getrandom` to randomize the buffer via the OS. + g.bench_function("getrandom::fill", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + getrandom::fill(&mut *token).expect("getrandom failed"); + black_box(&token); + } + }) + }); + + // Benchmark using `rand_chacha::ChaCha20Rng`. + g.bench_function("rand_chacha::ChaCha20Rng", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + let mut rng = rand_chacha::ChaCha20Rng::from_seed([0u8; 32]); + rng.fill_bytes(&mut *token); + black_box(&token); + } + }) + }); + + // Benchmark using `rand_xoshiro::Xoshiro128Plus`. 
+ g.bench_function("rand_xoshiro::Xoshiro128Plus", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + let mut rng = rand_xoshiro::Xoshiro128Plus::from_seed([0u8; 16]); + rng.fill_bytes(&mut *token); + black_box(&token); + } + }) + }); +} + +fn main() { + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let mut dataset = load_dataset().unwrap(); + let mut tokens = tokenize(&mut dataset).unwrap(); + if tokens.is_empty() { + panic!("No tokens found in the dataset."); + } + + // Setup the default durations + let mut criterion = Criterion::default() + .sample_size(10) // Each loop scans the whole dataset. + .warm_up_time(Duration::from_secs(1)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(20)); // Actual measurement time. + + // Benchmarks for lookup table transform + let mut group = criterion.benchmark_group("lookup-table"); + bench_lookup_table(&mut group, &mut tokens[..]); + group.finish(); + + // Benchmarks for random string generation + let mut group = criterion.benchmark_group("generate-random"); + bench_generate_random(&mut group, &mut tokens[..]); + group.finish(); + + criterion.final_summary(); +}