From deb2e926c44f72202531b6f44655b526493c0576 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 13:20:18 +0100 Subject: [PATCH 1/8] Experiments with Hash::write string hashing improvements from rapidhash --- src/fast.rs | 66 +++++++++++++++++++++++++++-------------------------- src/lib.rs | 51 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index a6f0f1e..b185eb4 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -3,7 +3,7 @@ use core::hash::{BuildHasher, Hasher}; use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed}; -use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3}; +use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, rotate_right, ARBITRARY3}; /// A [`Hasher`] instance implementing foldhash, optimized for speed. /// @@ -12,13 +12,13 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, A /// [`FixedState`] to create [`FoldHasher`]s. #[derive(Clone)] pub struct FoldHasher { - accumulator: u64, + pub(crate) accumulator: u64, sponge: u128, sponge_len: u8, - fold_seed: u64, - expand_seed: u64, - expand_seed2: u64, - expand_seed3: u64, + pub(crate) fold_seed: u64, + pub(crate) expand_seed: u64, + pub(crate) expand_seed2: u64, + pub(crate) expand_seed3: u64, } impl FoldHasher { @@ -62,41 +62,43 @@ impl Hasher for FoldHasher { // which costs only a single cycle (or none if executed with // instruction-level parallelism). let len = bytes.len(); - let base_seed = rotate_right(self.accumulator, len as u32); - if len <= 16 { - let mut s0 = base_seed; - let mut s1 = self.expand_seed; + + // moving self.accumulator outside of this if block improves performance, I'm surprised the + // compiler can't do this automatically + self.accumulator = if len <= 16 { + let accumulator = self.accumulator; + let mut s0 = 0; + let mut s1 = 0; + // XOR the input into s0, s1, then multiply and fold. if len >= 8 { - s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); - s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); + s0 = u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); + s1 = u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); } else if len >= 4 { - s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; - s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; + s0 = u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; + s1 = u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; } else if len > 0 { let lo = bytes[0]; let mid = bytes[len / 2]; let hi = bytes[len - 1]; - s0 ^= lo as u64; - s1 ^= ((hi as u64) << 8) | mid as u64; + s0 = hi as u64; + s1 = ((lo as u64) << 45) | mid as u64; } - self.accumulator = folded_multiply(s0, s1); - } else if len < 256 { - self.accumulator = hash_bytes_medium( - bytes, - base_seed, - base_seed.wrapping_add(self.expand_seed), - self.fold_seed, - ); + + // I prefer to wrapping add the length here, as not all platforms have a rotation, and + // although it has a smaller impact on the output hash, rapidhash's output quality and + // collision studies suggested this or an XOR are sufficient. Moving this to the bottom + // of the function appears to improve performance. + s0 ^= self.fold_seed; + s1 ^= accumulator.wrapping_add(len as u64); + + folded_multiply(s0, s1) + } else if len < 256 { // TODO: could increase to 288? 
+ // minimise the number of arguments, let the compiler choose what's best regarding + // register allocation, and make the assembly for this branch smaller + rapidhash_core_16_288(&self, bytes) } else { - self.accumulator = hash_bytes_long( - bytes, - base_seed, - base_seed.wrapping_add(self.expand_seed), - base_seed.wrapping_add(self.expand_seed2), - base_seed.wrapping_add(self.expand_seed3), - self.fold_seed, - ); + hash_bytes_long(&self, bytes) } } diff --git a/src/lib.rs b/src/lib.rs index ab04556..d11e345 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -114,6 +114,7 @@ pub use seed::SharedSeed; mod convenience; #[cfg(feature = "std")] pub use convenience::*; +use crate::fast::FoldHasher; // Arbitrary constants with high entropy. Hexadecimal digits of pi were used. const ARBITRARY0: u64 = 0x243f6a8885a308d3; @@ -246,17 +247,57 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> s0 ^ s1 } +#[cold] +#[must_use] +fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { + let mut seed = hasher.accumulator; + let mut slice = data; + + if slice.len() > 48 { + let mut see1 = seed; + let mut see2 = seed; + + while slice.len() >= 48 { + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); + see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ hasher.expand_seed3, u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + let (_, split) = slice.split_at(48); + slice = split; + } + + seed ^= see1 ^ see2; + } + + if slice.len() > 16 { + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + if slice.len() > 32 { + seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + } + } + + let mut a = u64::from_ne_bytes( data[data.len() - 16..data.len() - 8].try_into().unwrap()); + let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); + + seed = seed.wrapping_add(data.len() as u64); + a ^= hasher.expand_seed2; + b ^= seed; + folded_multiply(a, b) +} + /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
#[cold] #[inline(never)] fn hash_bytes_long( + hasher: &FoldHasher, bytes: &[u8], - mut s0: u64, - mut s1: u64, - mut s2: u64, - mut s3: u64, - fold_seed: u64, ) -> u64 { + let base_seed = rotate_right(hasher.accumulator, bytes.len() as u32); + let fold_seed = hasher.fold_seed; + let mut s0 = base_seed; + let mut s1 = base_seed.wrapping_add(hasher.expand_seed); + let mut s2 = base_seed.wrapping_add(hasher.expand_seed2); + let mut s3 = base_seed.wrapping_add(hasher.expand_seed3); + let chunks = bytes.chunks_exact(64); let remainder = chunks.remainder().len(); for chunk in chunks { From 6611486864e53af0cf6ac0e99ccf5006e5dd14e4 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 14:02:05 +0100 Subject: [PATCH 2/8] Add rapidhash to the foldhash benchmarks to make sure I've not unfairly run the rapidhash benchmarks somehow --- Cargo.toml | 1 + benches/bench.rs | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index b6c229d..1821e57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ criterion = "0.5" hashbrown = "0.14" uuid = "1.8" rand = "0.8" +rapidhash = "3.0" ahash = "0.8" fxhash = "0.2" chrono = "0.4" diff --git a/benches/bench.rs b/benches/bench.rs index 81cc2e1..6a3a1a4 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -147,24 +147,28 @@ fn profile_distr(distr: D, map_size: usize, c: &mut Criterion) profile_hashonly::("foldhash-fast", distr.clone(), c); profile_hashonly::("foldhash-quality", distr.clone(), c); + profile_hashonly::("rapidhash-fast", distr.clone(), c); profile_hashonly::("fxhash", distr.clone(), c); profile_hashonly::("ahash", distr.clone(), c); profile_hashonly::("siphash", distr.clone(), c); profile_lookup_miss::("foldhash-fast", distr.clone(), map_size, c); profile_lookup_miss::("foldhash-quality", distr.clone(), map_size, c); + profile_lookup_miss::("rapidhash-fast", distr.clone(), map_size, c); profile_lookup_miss::("fxhash", distr.clone(), map_size, c); profile_lookup_miss::("ahash", distr.clone(), map_size, c); profile_lookup_miss::("siphash", distr.clone(), map_size, c); profile_lookup_hit::("foldhash-fast", distr.clone(), map_size, c); profile_lookup_hit::("foldhash-quality", distr.clone(), map_size, c); + profile_lookup_hit::("rapidhash-fast", distr.clone(), map_size, c); profile_lookup_hit::("fxhash", distr.clone(), map_size, c); profile_lookup_hit::("ahash", distr.clone(), map_size, c); profile_lookup_hit::("siphash", distr.clone(), map_size, c); profile_set_build::("foldhash-fast", distr.clone(), map_size, c); profile_set_build::("foldhash-quality", distr.clone(), map_size, c); + profile_set_build::("rapidhash-fast", distr.clone(), map_size, c); profile_set_build::("fxhash", distr.clone(), map_size, c); profile_set_build::("ahash", distr.clone(), map_size, c); profile_set_build::("siphash", distr.clone(), map_size, c); From 48aa9b7404f3c72036f1cb409efb72e8215d97e5 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 14:17:55 +0100 Subject: [PATCH 3/8] Mark medium as inline(never) not cold, medium length performance improvement, englishword -5%, url -6% --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d11e345..619b41f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -247,7 +247,7 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> s0 ^ s1 } -#[cold] +#[inline(never)] #[must_use] fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut seed = hasher.accumulator; From 
17229ec8d86fe203747b75b08569135099726e65 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 15:20:21 +0100 Subject: [PATCH 4/8] Store a reference to the seeds array in FoldHasher, struuid -24.5%, strdate -38% --- src/fast.rs | 30 ++++++++++++------------------ src/lib.rs | 30 +++++++++++++++--------------- src/quality.rs | 2 +- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index b185eb4..82b9ca9 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -13,27 +13,21 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_ #[derive(Clone)] pub struct FoldHasher { pub(crate) accumulator: u64, + pub(crate) seeds: &'static [u64; 4], sponge: u128, sponge_len: u8, - pub(crate) fold_seed: u64, - pub(crate) expand_seed: u64, - pub(crate) expand_seed2: u64, - pub(crate) expand_seed3: u64, } impl FoldHasher { /// Initializes this [`FoldHasher`] with the given per-hasher seed and /// [`SharedSeed`]. #[inline] - pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher { + pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher { FoldHasher { accumulator: per_hasher_seed, + seeds: &shared_seed.seeds, sponge: 0, sponge_len: 0, - fold_seed: shared_seed.seeds[0], - expand_seed: shared_seed.seeds[1], - expand_seed2: shared_seed.seeds[2], - expand_seed3: shared_seed.seeds[3], } } @@ -43,7 +37,7 @@ impl FoldHasher { if self.sponge_len as usize + bits > 128 { let lo = self.sponge as u64; let hi = (self.sponge >> 64) as u64; - self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed); + self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]); self.sponge = x.into(); self.sponge_len = bits as u8; } else { @@ -89,16 +83,16 @@ impl Hasher for FoldHasher { // although it has a smaller impact on the output hash, rapidhash's output quality and // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. - s0 ^= self.fold_seed; + s0 ^= self.seeds[0]; s1 ^= accumulator.wrapping_add(len as u64); folded_multiply(s0, s1) - } else if len < 256 { // TODO: could increase to 288? 
- // minimise the number of arguments, let the compiler choose what's best regarding - // register allocation, and make the assembly for this branch smaller - rapidhash_core_16_288(&self, bytes) + } else if len <= 288 { + // minimising the number of arguments, but self.accumulator and self.seeds can already + // be loaded into registers in this function, so passing them directly appears faster + rapidhash_core_16_288(self.accumulator, self.seeds, bytes) } else { - hash_bytes_long(&self, bytes) + hash_bytes_long(self.accumulator, self.seeds, bytes) } } @@ -126,7 +120,7 @@ impl Hasher for FoldHasher { fn write_u128(&mut self, i: u128) { let lo = i as u64; let hi = (i >> 64) as u64; - self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed); + self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]); } #[inline(always)] @@ -143,7 +137,7 @@ impl Hasher for FoldHasher { if self.sponge_len > 0 { let lo = self.sponge as u64; let hi = (self.sponge >> 64) as u64; - folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed) + folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]) } else { self.accumulator } diff --git a/src/lib.rs b/src/lib.rs index 619b41f..aa500ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -248,9 +248,8 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> } #[inline(never)] -#[must_use] -fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { - let mut seed = hasher.accumulator; +fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 { + let mut seed = accumulator; let mut slice = data; if slice.len() > 48 { @@ -258,9 +257,9 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut see2 = seed; while slice.len() >= 48 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); - see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); - see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ hasher.expand_seed3, u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); + see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ seeds[3], u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -269,9 +268,9 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { } if slice.len() > 16 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); if slice.len() > 32 { - seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) 
^ seed); } } @@ -279,7 +278,7 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); seed = seed.wrapping_add(data.len() as u64); - a ^= hasher.expand_seed2; + a ^= seeds[2]; b ^= seed; folded_multiply(a, b) } @@ -288,15 +287,16 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { #[cold] #[inline(never)] fn hash_bytes_long( - hasher: &FoldHasher, + accumulator: u64, + seeds: &[u64; 4], bytes: &[u8], ) -> u64 { - let base_seed = rotate_right(hasher.accumulator, bytes.len() as u32); - let fold_seed = hasher.fold_seed; + let base_seed = rotate_right(accumulator, bytes.len() as u32); + let fold_seed = seeds[0]; let mut s0 = base_seed; - let mut s1 = base_seed.wrapping_add(hasher.expand_seed); - let mut s2 = base_seed.wrapping_add(hasher.expand_seed2); - let mut s3 = base_seed.wrapping_add(hasher.expand_seed3); + let mut s1 = base_seed.wrapping_add(seeds[1]); + let mut s2 = base_seed.wrapping_add(seeds[2]); + let mut s3 = base_seed.wrapping_add(seeds[3]); let chunks = bytes.chunks_exact(64); let remainder = chunks.remainder().len(); diff --git a/src/quality.rs b/src/quality.rs index 939b60e..ce1dd5a 100644 --- a/src/quality.rs +++ b/src/quality.rs @@ -20,7 +20,7 @@ impl FoldHasher { /// Initializes this [`FoldHasher`] with the given per-hasher seed and /// [`SharedSeed`]. #[inline(always)] - pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher { + pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher { FoldHasher { inner: fast::FoldHasher::with_seed(per_hasher_seed, shared_seed), } From 17d5672c936ac6ad6dfb48b5d5d44f5a9086c3fb Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 17:55:44 +0100 Subject: [PATCH 5/8] Add read_u64 and read_u32 helper methods to test bounds checks --- src/fast.rs | 24 ++++++++++-------------- src/lib.rs | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 82b9ca9..163f410 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -3,7 +3,7 @@ use core::hash::{BuildHasher, Hasher}; use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed}; -use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, rotate_right, ARBITRARY3}; +use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, read_u32, read_u64, rotate_right, ARBITRARY3}; /// A [`Hasher`] instance implementing foldhash, optimized for speed. /// @@ -50,27 +50,23 @@ impl FoldHasher { impl Hasher for FoldHasher { #[inline(always)] fn write(&mut self, bytes: &[u8]) { - // We perform overlapping reads in the byte hash which could lead to - // trivial length-extension attacks. These should be defeated by - // adding a length-dependent rotation on our unpredictable seed - // which costs only a single cycle (or none if executed with - // instruction-level parallelism). + let accumulator = self.accumulator; + let seeds = self.seeds; let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically self.accumulator = if len <= 16 { - let accumulator = self.accumulator; let mut s0 = 0; let mut s1 = 0; // XOR the input into s0, s1, then multiply and fold. 
if len >= 8 { - s0 = u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); - s1 = u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); + s0 = read_u64(bytes, 0); + s1 = read_u64(bytes, len - 8); } else if len >= 4 { - s0 = u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; - s1 = u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; + s0 = read_u32(bytes, 0) as u64; + s1 = read_u32(bytes, len - 4) as u64; } else if len > 0 { let lo = bytes[0]; let mid = bytes[len / 2]; @@ -83,16 +79,16 @@ impl Hasher for FoldHasher { // although it has a smaller impact on the output hash, rapidhash's output quality and // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. - s0 ^= self.seeds[0]; + s0 ^= seeds[0]; s1 ^= accumulator.wrapping_add(len as u64); folded_multiply(s0, s1) } else if len <= 288 { // minimising the number of arguments, but self.accumulator and self.seeds can already // be loaded into registers in this function, so passing them directly appears faster - rapidhash_core_16_288(self.accumulator, self.seeds, bytes) + rapidhash_core_16_288(accumulator, seeds, bytes) } else { - hash_bytes_long(self.accumulator, self.seeds, bytes) + hash_bytes_long(accumulator, seeds, bytes) } } diff --git a/src/lib.rs b/src/lib.rs index aa500ed..6bd894f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -221,6 +221,30 @@ const fn rotate_right(x: u64, r: u32) -> u64 { } } +/// A helper method for doing an unaligned 32-bit read from a byte slice. +#[inline(always)] +fn read_u32(slice: &[u8], offset: usize) -> u32 { + debug_assert!(slice.len() >= 4 + offset); + u32::from_ne_bytes(slice[offset..offset + 4].try_into().unwrap()) + + // Uncomment the following to explicitly omit bounds checks for debugging: + // debug_assert!(offset as isize >= 0); + // debug_assert!(slice.len() >= 4 + offset); + // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) } +} + +/// A helper method for doing an unaligned 64-bit read from a byte slice. +#[inline(always)] +fn read_u64(slice: &[u8], offset: usize) -> u64 { + debug_assert!(slice.len() >= 8 + offset); + u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap()) + + // Uncomment the following to explicitly omit bounds checks for debugging: + // debug_assert!(offset as isize >= 0); + // debug_assert!(slice.len() >= 4 + offset); + // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) } +} + /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 { // Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from @@ -257,9 +281,9 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 let mut see2 = seed; while slice.len() >= 48 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); - see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); - see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ seeds[3], u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed); + see1 = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ see1); + see2 = folded_multiply(read_u64(slice, 32) ^ seeds[3], read_u64(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -268,14 +292,14 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 } if slice.len() > 16 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed); if slice.len() > 32 { - seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + seed = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ seed); } } - let mut a = u64::from_ne_bytes( data[data.len() - 16..data.len() - 8].try_into().unwrap()); - let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); + let mut a = read_u64(data, data.len() - 16); + let mut b = read_u64(data, data.len() - 8); seed = seed.wrapping_add(data.len() as u64); a ^= seeds[2]; From bd2b1ffc64be13c24958a46d341a8959afa5e222 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:28:17 +0100 Subject: [PATCH 6/8] Set codegen-units=1 and incremental=false for consistent benchmarking --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 1821e57..aa1637a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,3 +43,6 @@ harness = false [profile.release] lto = "thin" +codegen-units = 1 +incremental = false +debug-assertions = false \ No newline at end of file From 6500c8c609cab980886d852f9e2fec3d35a47c8d Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:29:04 +0100 Subject: [PATCH 7/8] Re-implement read_u64 and read_u32 in a safe manner that the compiler can omit the bounds checks for --- src/fast.rs | 22 +++++++++++----------- src/lib.rs | 28 ++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 163f410..022ec2b 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -52,25 +52,25 @@ impl Hasher for FoldHasher { fn write(&mut self, bytes: &[u8]) { let accumulator = self.accumulator; let seeds = self.seeds; - let len = bytes.len(); + // let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically - self.accumulator = if len <= 16 { + self.accumulator = if bytes.len() <= 16 { let mut s0 = 0; let mut s1 = 0; // XOR the input into s0, s1, then multiply and 
fold. - if len >= 8 { + if bytes.len() >= 8 { s0 = read_u64(bytes, 0); - s1 = read_u64(bytes, len - 8); - } else if len >= 4 { + s1 = read_u64(bytes, bytes.len() - 8); + } else if bytes.len() >= 4 { s0 = read_u32(bytes, 0) as u64; - s1 = read_u32(bytes, len - 4) as u64; - } else if len > 0 { + s1 = read_u32(bytes, bytes.len() - 4) as u64; + } else if bytes.len() > 0 { let lo = bytes[0]; - let mid = bytes[len / 2]; - let hi = bytes[len - 1]; + let mid = bytes[bytes.len() / 2]; + let hi = bytes[bytes.len() - 1]; s0 = hi as u64; s1 = ((lo as u64) << 45) | mid as u64; } @@ -80,10 +80,10 @@ impl Hasher for FoldHasher { // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. s0 ^= seeds[0]; - s1 ^= accumulator.wrapping_add(len as u64); + s1 ^= accumulator.wrapping_add(bytes.len() as u64); folded_multiply(s0, s1) - } else if len <= 288 { + } else if bytes.len() <= 288 { // minimising the number of arguments, but self.accumulator and self.seeds can already // be loaded into registers in this function, so passing them directly appears faster rapidhash_core_16_288(accumulator, seeds, bytes) diff --git a/src/lib.rs b/src/lib.rs index 6bd894f..21538b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -224,25 +224,41 @@ const fn rotate_right(x: u64, r: u32) -> u64 { /// A helper method for doing an unaligned 32-bit read from a byte slice. #[inline(always)] fn read_u32(slice: &[u8], offset: usize) -> u32 { - debug_assert!(slice.len() >= 4 + offset); - u32::from_ne_bytes(slice[offset..offset + 4].try_into().unwrap()) - // Uncomment the following to explicitly omit bounds checks for debugging: // debug_assert!(offset as isize >= 0); // debug_assert!(slice.len() >= 4 + offset); // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) } + + // Equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<4>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u32: slice too short"), + }; + u32::from_ne_bytes(buf) } /// A helper method for doing an unaligned 64-bit read from a byte slice. +/// +/// This function is specifically implemented this way to allow the compiler +/// to optimise away the bounds checks. The traditional approach of using +/// `u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())` does +/// not allow the compiler to fully optimise out the bounds checks for +/// unknown reasons. #[inline(always)] fn read_u64(slice: &[u8], offset: usize) -> u64 { - debug_assert!(slice.len() >= 8 + offset); - u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap()) - // Uncomment the following to explicitly omit bounds checks for debugging: // debug_assert!(offset as isize >= 0); // debug_assert!(slice.len() >= 4 + offset); // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) } + + // equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<8>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u64: slice too short"), + }; + u64::from_ne_bytes(buf) } /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
From 521b73b8c894d3de72c2b972dff6d6dabe62a99b Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:43:19 +0100 Subject: [PATCH 8/8] Replace seed wrapping_add back to rotate_right, confirmed no performance penalty --- src/fast.rs | 3 +-- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 022ec2b..8c98e1e 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -52,7 +52,6 @@ impl Hasher for FoldHasher { fn write(&mut self, bytes: &[u8]) { let accumulator = self.accumulator; let seeds = self.seeds; - // let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically @@ -80,7 +79,7 @@ impl Hasher for FoldHasher { // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. s0 ^= seeds[0]; - s1 ^= accumulator.wrapping_add(bytes.len() as u64); + s1 ^= rotate_right(accumulator, bytes.len() as u32); folded_multiply(s0, s1) } else if bytes.len() <= 288 { diff --git a/src/lib.rs b/src/lib.rs index 21538b8..d636b85 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -317,7 +317,7 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 let mut a = read_u64(data, data.len() - 16); let mut b = read_u64(data, data.len() - 8); - seed = seed.wrapping_add(data.len() as u64); + seed = rotate_right(seed, data.len() as u32); a ^= seeds[2]; b ^= seed; folded_multiply(a, b)
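
Note: for reference, below is a self-contained sketch of the short-input path the series converges on in patch 8, together with the first_chunk-based read helpers from patch 7. It is illustrative only: folded_multiply is assumed to be the usual 64x64 -> 128-bit widening multiply folded with XOR, the standard u64::rotate_right stands in for the crate's rotate_right helper, and the seed/accumulator values are placeholders rather than the crate's generated per-hasher and shared seeds.

// Sketch only; not part of the patch series. Assumes folded_multiply is a
// widening multiply folded with XOR, and uses placeholder seed values.

/// Widening 64x64 multiply, then XOR the high and low halves together.
fn folded_multiply(x: u64, y: u64) -> u64 {
    let full = (x as u128) * (y as u128);
    (full as u64) ^ ((full >> 64) as u64)
}

/// Unaligned read written so the compiler can elide the bounds checks,
/// mirroring the first_chunk formulation from patch 7.
fn read_u64(slice: &[u8], offset: usize) -> u64 {
    let buf = slice.split_at(offset).1.first_chunk::<8>()
        .expect("read_u64: slice too short");
    u64::from_ne_bytes(*buf)
}

fn read_u32(slice: &[u8], offset: usize) -> u32 {
    let buf = slice.split_at(offset).1.first_chunk::<4>()
        .expect("read_u32: slice too short");
    u32::from_ne_bytes(*buf)
}

/// The <= 16-byte branch of Hasher::write as it stands after patch 8:
/// read from both ends of the input, mix in the fold seed and a
/// length-dependent rotation of the accumulator, then fold once.
fn hash_short(bytes: &[u8], accumulator: u64, fold_seed: u64) -> u64 {
    debug_assert!(bytes.len() <= 16);
    let len = bytes.len();
    let (mut s0, mut s1) = (0u64, 0u64);
    if len >= 8 {
        s0 = read_u64(bytes, 0);
        s1 = read_u64(bytes, len - 8);
    } else if len >= 4 {
        s0 = read_u32(bytes, 0) as u64;
        s1 = read_u32(bytes, len - 4) as u64;
    } else if len > 0 {
        let (lo, mid, hi) = (bytes[0], bytes[len / 2], bytes[len - 1]);
        s0 = hi as u64;
        s1 = ((lo as u64) << 45) | mid as u64;
    }
    s0 ^= fold_seed;
    s1 ^= accumulator.rotate_right(len as u32);
    folded_multiply(s0, s1)
}

fn main() {
    // Placeholder accumulator and fold seed (hex digits of pi), purely for
    // demonstration; the crate derives these from its seeding machinery.
    let (acc, fold_seed) = (0x243f6a8885a308d3u64, 0x13198a2e03707344u64);
    println!("{:016x}", hash_short(b"hello world", acc, fold_seed));
    println!("{:016x}", hash_short(b"abc", acc, fold_seed));
}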