From deb2e926c44f72202531b6f44655b526493c0576 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 13:20:18 +0100 Subject: [PATCH 1/8] Experiments with Hash::write string hashing improvements from rapidhash --- src/fast.rs | 66 +++++++++++++++++++++++++++-------------------------- src/lib.rs | 51 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index a6f0f1e..b185eb4 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -3,7 +3,7 @@ use core::hash::{BuildHasher, Hasher}; use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed}; -use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3}; +use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, rotate_right, ARBITRARY3}; /// A [`Hasher`] instance implementing foldhash, optimized for speed. /// @@ -12,13 +12,13 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, A /// [`FixedState`] to create [`FoldHasher`]s. #[derive(Clone)] pub struct FoldHasher { - accumulator: u64, + pub(crate) accumulator: u64, sponge: u128, sponge_len: u8, - fold_seed: u64, - expand_seed: u64, - expand_seed2: u64, - expand_seed3: u64, + pub(crate) fold_seed: u64, + pub(crate) expand_seed: u64, + pub(crate) expand_seed2: u64, + pub(crate) expand_seed3: u64, } impl FoldHasher { @@ -62,41 +62,43 @@ impl Hasher for FoldHasher { // which costs only a single cycle (or none if executed with // instruction-level parallelism). let len = bytes.len(); - let base_seed = rotate_right(self.accumulator, len as u32); - if len <= 16 { - let mut s0 = base_seed; - let mut s1 = self.expand_seed; + + // moving self.accumulator outside of this if block improves performance, I'm surprised the + // compiler can't do this automatically + self.accumulator = if len <= 16 { + let accumulator = self.accumulator; + let mut s0 = 0; + let mut s1 = 0; + // XOR the input into s0, s1, then multiply and fold. if len >= 8 { - s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); - s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); + s0 = u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); + s1 = u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); } else if len >= 4 { - s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; - s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; + s0 = u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; + s1 = u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; } else if len > 0 { let lo = bytes[0]; let mid = bytes[len / 2]; let hi = bytes[len - 1]; - s0 ^= lo as u64; - s1 ^= ((hi as u64) << 8) | mid as u64; + s0 = hi as u64; + s1 = ((lo as u64) << 45) | mid as u64; } - self.accumulator = folded_multiply(s0, s1); - } else if len < 256 { - self.accumulator = hash_bytes_medium( - bytes, - base_seed, - base_seed.wrapping_add(self.expand_seed), - self.fold_seed, - ); + + // I prefer to wrapping add the length here, as not all platforms have a rotation, and + // although it has a smaller impact on the output hash, rapidhash's output quality and + // collision studies suggested this or an XOR are sufficient. Moving this to the bottom + // of the function appears to improve performance. + s0 ^= self.fold_seed; + s1 ^= accumulator.wrapping_add(len as u64); + + folded_multiply(s0, s1) + } else if len < 256 { // TODO: could increase to 288? 
+ // minimise the number of arguments, let the compiler choose what's best regarding + // register allocation, and make the assembly for this branch smaller + rapidhash_core_16_288(&self, bytes) } else { - self.accumulator = hash_bytes_long( - bytes, - base_seed, - base_seed.wrapping_add(self.expand_seed), - base_seed.wrapping_add(self.expand_seed2), - base_seed.wrapping_add(self.expand_seed3), - self.fold_seed, - ); + hash_bytes_long(&self, bytes) } } diff --git a/src/lib.rs b/src/lib.rs index ab04556..d11e345 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -114,6 +114,7 @@ pub use seed::SharedSeed; mod convenience; #[cfg(feature = "std")] pub use convenience::*; +use crate::fast::FoldHasher; // Arbitrary constants with high entropy. Hexadecimal digits of pi were used. const ARBITRARY0: u64 = 0x243f6a8885a308d3; @@ -246,17 +247,57 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> s0 ^ s1 } +#[cold] +#[must_use] +fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { + let mut seed = hasher.accumulator; + let mut slice = data; + + if slice.len() > 48 { + let mut see1 = seed; + let mut see2 = seed; + + while slice.len() >= 48 { + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); + see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ hasher.expand_seed3, u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + let (_, split) = slice.split_at(48); + slice = split; + } + + seed ^= see1 ^ see2; + } + + if slice.len() > 16 { + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + if slice.len() > 32 { + seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + } + } + + let mut a = u64::from_ne_bytes( data[data.len() - 16..data.len() - 8].try_into().unwrap()); + let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); + + seed = seed.wrapping_add(data.len() as u64); + a ^= hasher.expand_seed2; + b ^= seed; + folded_multiply(a, b) +} + /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
#[cold] #[inline(never)] fn hash_bytes_long( + hasher: &FoldHasher, bytes: &[u8], - mut s0: u64, - mut s1: u64, - mut s2: u64, - mut s3: u64, - fold_seed: u64, ) -> u64 { + let base_seed = rotate_right(hasher.accumulator, bytes.len() as u32); + let fold_seed = hasher.fold_seed; + let mut s0 = base_seed; + let mut s1 = base_seed.wrapping_add(hasher.expand_seed); + let mut s2 = base_seed.wrapping_add(hasher.expand_seed2); + let mut s3 = base_seed.wrapping_add(hasher.expand_seed3); + let chunks = bytes.chunks_exact(64); let remainder = chunks.remainder().len(); for chunk in chunks { From 6611486864e53af0cf6ac0e99ccf5006e5dd14e4 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 14:02:05 +0100 Subject: [PATCH 2/8] Add rapidhash to the foldhash benchmarks to make sure I've not unfairly run the rapidhash benchmarks somehow --- Cargo.toml | 1 + benches/bench.rs | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index b6c229d..1821e57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ criterion = "0.5" hashbrown = "0.14" uuid = "1.8" rand = "0.8" +rapidhash = "3.0" ahash = "0.8" fxhash = "0.2" chrono = "0.4" diff --git a/benches/bench.rs b/benches/bench.rs index 81cc2e1..6a3a1a4 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -147,24 +147,28 @@ fn profile_distr(distr: D, map_size: usize, c: &mut Criterion) profile_hashonly::("foldhash-fast", distr.clone(), c); profile_hashonly::("foldhash-quality", distr.clone(), c); + profile_hashonly::("rapidhash-fast", distr.clone(), c); profile_hashonly::("fxhash", distr.clone(), c); profile_hashonly::("ahash", distr.clone(), c); profile_hashonly::("siphash", distr.clone(), c); profile_lookup_miss::("foldhash-fast", distr.clone(), map_size, c); profile_lookup_miss::("foldhash-quality", distr.clone(), map_size, c); + profile_lookup_miss::("rapidhash-fast", distr.clone(), map_size, c); profile_lookup_miss::("fxhash", distr.clone(), map_size, c); profile_lookup_miss::("ahash", distr.clone(), map_size, c); profile_lookup_miss::("siphash", distr.clone(), map_size, c); profile_lookup_hit::("foldhash-fast", distr.clone(), map_size, c); profile_lookup_hit::("foldhash-quality", distr.clone(), map_size, c); + profile_lookup_hit::("rapidhash-fast", distr.clone(), map_size, c); profile_lookup_hit::("fxhash", distr.clone(), map_size, c); profile_lookup_hit::("ahash", distr.clone(), map_size, c); profile_lookup_hit::("siphash", distr.clone(), map_size, c); profile_set_build::("foldhash-fast", distr.clone(), map_size, c); profile_set_build::("foldhash-quality", distr.clone(), map_size, c); + profile_set_build::("rapidhash-fast", distr.clone(), map_size, c); profile_set_build::("fxhash", distr.clone(), map_size, c); profile_set_build::("ahash", distr.clone(), map_size, c); profile_set_build::("siphash", distr.clone(), map_size, c); From 48aa9b7404f3c72036f1cb409efb72e8215d97e5 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 14:17:55 +0100 Subject: [PATCH 3/8] Mark medium as inline(never) not cold, medium length performance improvement, englishword -5%, url -6% --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d11e345..619b41f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -247,7 +247,7 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> s0 ^ s1 } -#[cold] +#[inline(never)] #[must_use] fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut seed = hasher.accumulator; From 
17229ec8d86fe203747b75b08569135099726e65 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 15:20:21 +0100 Subject: [PATCH 4/8] Store a reference to the seeds array in FoldHasher, struuid -24.5%, strdate -38% --- src/fast.rs | 30 ++++++++++++------------------ src/lib.rs | 30 +++++++++++++++--------------- src/quality.rs | 2 +- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index b185eb4..82b9ca9 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -13,27 +13,21 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_ #[derive(Clone)] pub struct FoldHasher { pub(crate) accumulator: u64, + pub(crate) seeds: &'static [u64; 4], sponge: u128, sponge_len: u8, - pub(crate) fold_seed: u64, - pub(crate) expand_seed: u64, - pub(crate) expand_seed2: u64, - pub(crate) expand_seed3: u64, } impl FoldHasher { /// Initializes this [`FoldHasher`] with the given per-hasher seed and /// [`SharedSeed`]. #[inline] - pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher { + pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher { FoldHasher { accumulator: per_hasher_seed, + seeds: &shared_seed.seeds, sponge: 0, sponge_len: 0, - fold_seed: shared_seed.seeds[0], - expand_seed: shared_seed.seeds[1], - expand_seed2: shared_seed.seeds[2], - expand_seed3: shared_seed.seeds[3], } } @@ -43,7 +37,7 @@ impl FoldHasher { if self.sponge_len as usize + bits > 128 { let lo = self.sponge as u64; let hi = (self.sponge >> 64) as u64; - self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed); + self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]); self.sponge = x.into(); self.sponge_len = bits as u8; } else { @@ -89,16 +83,16 @@ impl Hasher for FoldHasher { // although it has a smaller impact on the output hash, rapidhash's output quality and // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. - s0 ^= self.fold_seed; + s0 ^= self.seeds[0]; s1 ^= accumulator.wrapping_add(len as u64); folded_multiply(s0, s1) - } else if len < 256 { // TODO: could increase to 288? 
- // minimise the number of arguments, let the compiler choose what's best regarding - // register allocation, and make the assembly for this branch smaller - rapidhash_core_16_288(&self, bytes) + } else if len <= 288 { + // minimising the number of arguments, but self.accumulator and self.seeds can already + // be loaded into registers in this function, so passing them directly appears faster + rapidhash_core_16_288(self.accumulator, self.seeds, bytes) } else { - hash_bytes_long(&self, bytes) + hash_bytes_long(self.accumulator, self.seeds, bytes) } } @@ -126,7 +120,7 @@ impl Hasher for FoldHasher { fn write_u128(&mut self, i: u128) { let lo = i as u64; let hi = (i >> 64) as u64; - self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed); + self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]); } #[inline(always)] @@ -143,7 +137,7 @@ impl Hasher for FoldHasher { if self.sponge_len > 0 { let lo = self.sponge as u64; let hi = (self.sponge >> 64) as u64; - folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed) + folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]) } else { self.accumulator } diff --git a/src/lib.rs b/src/lib.rs index 619b41f..aa500ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -248,9 +248,8 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> } #[inline(never)] -#[must_use] -fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { - let mut seed = hasher.accumulator; +fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 { + let mut seed = accumulator; let mut slice = data; if slice.len() > 48 { @@ -258,9 +257,9 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut see2 = seed; while slice.len() >= 48 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); - see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); - see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ hasher.expand_seed3, u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); + see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ seeds[3], u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -269,9 +268,9 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { } if slice.len() > 16 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ hasher.expand_seed, u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); if slice.len() > 32 { - seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ hasher.expand_seed2, u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) 
^ seed); } } @@ -279,7 +278,7 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); seed = seed.wrapping_add(data.len() as u64); - a ^= hasher.expand_seed2; + a ^= seeds[2]; b ^= seed; folded_multiply(a, b) } @@ -288,15 +287,16 @@ fn rapidhash_core_16_288(hasher: &FoldHasher, data: &[u8]) -> u64 { #[cold] #[inline(never)] fn hash_bytes_long( - hasher: &FoldHasher, + accumulator: u64, + seeds: &[u64; 4], bytes: &[u8], ) -> u64 { - let base_seed = rotate_right(hasher.accumulator, bytes.len() as u32); - let fold_seed = hasher.fold_seed; + let base_seed = rotate_right(accumulator, bytes.len() as u32); + let fold_seed = seeds[0]; let mut s0 = base_seed; - let mut s1 = base_seed.wrapping_add(hasher.expand_seed); - let mut s2 = base_seed.wrapping_add(hasher.expand_seed2); - let mut s3 = base_seed.wrapping_add(hasher.expand_seed3); + let mut s1 = base_seed.wrapping_add(seeds[1]); + let mut s2 = base_seed.wrapping_add(seeds[2]); + let mut s3 = base_seed.wrapping_add(seeds[3]); let chunks = bytes.chunks_exact(64); let remainder = chunks.remainder().len(); diff --git a/src/quality.rs b/src/quality.rs index 939b60e..ce1dd5a 100644 --- a/src/quality.rs +++ b/src/quality.rs @@ -20,7 +20,7 @@ impl FoldHasher { /// Initializes this [`FoldHasher`] with the given per-hasher seed and /// [`SharedSeed`]. #[inline(always)] - pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher { + pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher { FoldHasher { inner: fast::FoldHasher::with_seed(per_hasher_seed, shared_seed), } From 17d5672c936ac6ad6dfb48b5d5d44f5a9086c3fb Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 17:55:44 +0100 Subject: [PATCH 5/8] Add read_u64 and read_u32 helper methods to test bounds checks --- src/fast.rs | 24 ++++++++++-------------- src/lib.rs | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 82b9ca9..163f410 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -3,7 +3,7 @@ use core::hash::{BuildHasher, Hasher}; use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed}; -use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, rotate_right, ARBITRARY3}; +use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, read_u32, read_u64, rotate_right, ARBITRARY3}; /// A [`Hasher`] instance implementing foldhash, optimized for speed. /// @@ -50,27 +50,23 @@ impl FoldHasher { impl Hasher for FoldHasher { #[inline(always)] fn write(&mut self, bytes: &[u8]) { - // We perform overlapping reads in the byte hash which could lead to - // trivial length-extension attacks. These should be defeated by - // adding a length-dependent rotation on our unpredictable seed - // which costs only a single cycle (or none if executed with - // instruction-level parallelism). + let accumulator = self.accumulator; + let seeds = self.seeds; let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically self.accumulator = if len <= 16 { - let accumulator = self.accumulator; let mut s0 = 0; let mut s1 = 0; // XOR the input into s0, s1, then multiply and fold. 
if len >= 8 { - s0 = u64::from_ne_bytes(bytes[0..8].try_into().unwrap()); - s1 = u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap()); + s0 = read_u64(bytes, 0); + s1 = read_u64(bytes, len - 8); } else if len >= 4 { - s0 = u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64; - s1 = u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64; + s0 = read_u32(bytes, 0) as u64; + s1 = read_u32(bytes, len - 4) as u64; } else if len > 0 { let lo = bytes[0]; let mid = bytes[len / 2]; @@ -83,16 +79,16 @@ impl Hasher for FoldHasher { // although it has a smaller impact on the output hash, rapidhash's output quality and // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. - s0 ^= self.seeds[0]; + s0 ^= seeds[0]; s1 ^= accumulator.wrapping_add(len as u64); folded_multiply(s0, s1) } else if len <= 288 { // minimising the number of arguments, but self.accumulator and self.seeds can already // be loaded into registers in this function, so passing them directly appears faster - rapidhash_core_16_288(self.accumulator, self.seeds, bytes) + rapidhash_core_16_288(accumulator, seeds, bytes) } else { - hash_bytes_long(self.accumulator, self.seeds, bytes) + hash_bytes_long(accumulator, seeds, bytes) } } diff --git a/src/lib.rs b/src/lib.rs index aa500ed..6bd894f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -221,6 +221,30 @@ const fn rotate_right(x: u64, r: u32) -> u64 { } } +/// A helper method for doing an unaligned 32-bit read from a byte slice. +#[inline(always)] +fn read_u32(slice: &[u8], offset: usize) -> u32 { + debug_assert!(slice.len() >= 4 + offset); + u32::from_ne_bytes(slice[offset..offset + 4].try_into().unwrap()) + + // Uncomment the following to explicitly omit bounds checks for debugging: + // debug_assert!(offset as isize >= 0); + // debug_assert!(slice.len() >= 4 + offset); + // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) } +} + +/// A helper method for doing an unaligned 64-bit read from a byte slice. +#[inline(always)] +fn read_u64(slice: &[u8], offset: usize) -> u64 { + debug_assert!(slice.len() >= 8 + offset); + u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap()) + + // Uncomment the following to explicitly omit bounds checks for debugging: + // debug_assert!(offset as isize >= 0); + // debug_assert!(slice.len() >= 4 + offset); + // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) } +} + /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 { // Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from @@ -257,9 +281,9 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 let mut see2 = seed; while slice.len() >= 48 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); - see1 = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ see1); - see2 = folded_multiply(u64::from_ne_bytes(slice[32..40].try_into().unwrap()) ^ seeds[3], u64::from_ne_bytes(slice[40..48].try_into().unwrap()) ^ see2); + seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed); + see1 = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ see1); + see2 = folded_multiply(read_u64(slice, 32) ^ seeds[3], read_u64(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -268,14 +292,14 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 } if slice.len() > 16 { - seed = folded_multiply(u64::from_ne_bytes(slice[0..8].try_into().unwrap()) ^ seeds[1], u64::from_ne_bytes(slice[8..16].try_into().unwrap()) ^ seed); + seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed); if slice.len() > 32 { - seed = folded_multiply(u64::from_ne_bytes(slice[16..24].try_into().unwrap()) ^ seeds[2], u64::from_ne_bytes(slice[24..32].try_into().unwrap()) ^ seed); + seed = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ seed); } } - let mut a = u64::from_ne_bytes( data[data.len() - 16..data.len() - 8].try_into().unwrap()); - let mut b = u64::from_ne_bytes( data[data.len() - 8..data.len()].try_into().unwrap()); + let mut a = read_u64(data, data.len() - 16); + let mut b = read_u64(data, data.len() - 8); seed = seed.wrapping_add(data.len() as u64); a ^= seeds[2]; From bd2b1ffc64be13c24958a46d341a8959afa5e222 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:28:17 +0100 Subject: [PATCH 6/8] Set codegen-units=1 and incremental=false for consistent benchmarking --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 1821e57..aa1637a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,3 +43,6 @@ harness = false [profile.release] lto = "thin" +codegen-units = 1 +incremental = false +debug-assertions = false \ No newline at end of file From 6500c8c609cab980886d852f9e2fec3d35a47c8d Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:29:04 +0100 Subject: [PATCH 7/8] Re-implement read_u64 and read_u32 in a safe manner that the compiler can omit the bounds checks for --- src/fast.rs | 22 +++++++++++----------- src/lib.rs | 28 ++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 163f410..022ec2b 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -52,25 +52,25 @@ impl Hasher for FoldHasher { fn write(&mut self, bytes: &[u8]) { let accumulator = self.accumulator; let seeds = self.seeds; - let len = bytes.len(); + // let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically - self.accumulator = if len <= 16 { + self.accumulator = if bytes.len() <= 16 { let mut s0 = 0; let mut s1 = 0; // XOR the input into s0, s1, then multiply and 
fold. - if len >= 8 { + if bytes.len() >= 8 { s0 = read_u64(bytes, 0); - s1 = read_u64(bytes, len - 8); - } else if len >= 4 { + s1 = read_u64(bytes, bytes.len() - 8); + } else if bytes.len() >= 4 { s0 = read_u32(bytes, 0) as u64; - s1 = read_u32(bytes, len - 4) as u64; - } else if len > 0 { + s1 = read_u32(bytes, bytes.len() - 4) as u64; + } else if bytes.len() > 0 { let lo = bytes[0]; - let mid = bytes[len / 2]; - let hi = bytes[len - 1]; + let mid = bytes[bytes.len() / 2]; + let hi = bytes[bytes.len() - 1]; s0 = hi as u64; s1 = ((lo as u64) << 45) | mid as u64; } @@ -80,10 +80,10 @@ impl Hasher for FoldHasher { // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. s0 ^= seeds[0]; - s1 ^= accumulator.wrapping_add(len as u64); + s1 ^= accumulator.wrapping_add(bytes.len() as u64); folded_multiply(s0, s1) - } else if len <= 288 { + } else if bytes.len() <= 288 { // minimising the number of arguments, but self.accumulator and self.seeds can already // be loaded into registers in this function, so passing them directly appears faster rapidhash_core_16_288(accumulator, seeds, bytes) diff --git a/src/lib.rs b/src/lib.rs index 6bd894f..21538b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -224,25 +224,41 @@ const fn rotate_right(x: u64, r: u32) -> u64 { /// A helper method for doing an unaligned 32-bit read from a byte slice. #[inline(always)] fn read_u32(slice: &[u8], offset: usize) -> u32 { - debug_assert!(slice.len() >= 4 + offset); - u32::from_ne_bytes(slice[offset..offset + 4].try_into().unwrap()) - // Uncomment the following to explicitly omit bounds checks for debugging: // debug_assert!(offset as isize >= 0); // debug_assert!(slice.len() >= 4 + offset); // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) } + + // Equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<4>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u32: slice too short"), + }; + u32::from_ne_bytes(buf) } /// A helper method for doing an unaligned 64-bit read from a byte slice. +/// +/// This function is specifically implemented this way to allow the compiler +/// to optimise away the bounds checks. The traditional approach of using +/// `u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())` does +/// not allow the compiler to fully optimise out the bounds checks for +/// unknown reasons. #[inline(always)] fn read_u64(slice: &[u8], offset: usize) -> u64 { - debug_assert!(slice.len() >= 8 + offset); - u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap()) - // Uncomment the following to explicitly omit bounds checks for debugging: // debug_assert!(offset as isize >= 0); // debug_assert!(slice.len() >= 4 + offset); // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) } + + // equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<8>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u64: slice too short"), + }; + u64::from_ne_bytes(buf) } /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16. 
From 521b73b8c894d3de72c2b972dff6d6dabe62a99b Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 18:43:19 +0100 Subject: [PATCH 8/8] Replace seed wrapping_add back to rotate_right, confirmed no performance penalty --- src/fast.rs | 3 +-- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fast.rs b/src/fast.rs index 022ec2b..8c98e1e 100644 --- a/src/fast.rs +++ b/src/fast.rs @@ -52,7 +52,6 @@ impl Hasher for FoldHasher { fn write(&mut self, bytes: &[u8]) { let accumulator = self.accumulator; let seeds = self.seeds; - // let len = bytes.len(); // moving self.accumulator outside of this if block improves performance, I'm surprised the // compiler can't do this automatically @@ -80,7 +79,7 @@ impl Hasher for FoldHasher { // collision studies suggested this or an XOR are sufficient. Moving this to the bottom // of the function appears to improve performance. s0 ^= seeds[0]; - s1 ^= accumulator.wrapping_add(bytes.len() as u64); + s1 ^= rotate_right(accumulator, bytes.len() as u32); folded_multiply(s0, s1) } else if bytes.len() <= 288 { diff --git a/src/lib.rs b/src/lib.rs index 21538b8..d636b85 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -317,7 +317,7 @@ fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 let mut a = read_u64(data, data.len() - 16); let mut b = read_u64(data, data.len() - 8); - seed = seed.wrapping_add(data.len() as u64); + seed = rotate_right(seed, data.len() as u32); a ^= seeds[2]; b ^= seed; folded_multiply(a, b)
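
Note: for reference, below is a self-contained sketch of the short-input path the series converges on in patch 8, together with the first_chunk-based read helpers from patch 7. It is illustrative only: folded_multiply is assumed to be the usual 64x64 -> 128-bit widening multiply folded with XOR, the standard u64::rotate_right stands in for the crate's rotate_right helper, and the seed/accumulator values are placeholders rather than the crate's generated per-hasher and shared seeds.

// Sketch only; not part of the patch series. Assumes folded_multiply is a
// widening multiply folded with XOR, and uses placeholder seed values.

/// Widening 64x64 multiply, then XOR the high and low halves together.
fn folded_multiply(x: u64, y: u64) -> u64 {
    let full = (x as u128) * (y as u128);
    (full as u64) ^ ((full >> 64) as u64)
}

/// Unaligned read written so the compiler can elide the bounds checks,
/// mirroring the first_chunk formulation from patch 7.
fn read_u64(slice: &[u8], offset: usize) -> u64 {
    let buf = slice.split_at(offset).1.first_chunk::<8>()
        .expect("read_u64: slice too short");
    u64::from_ne_bytes(*buf)
}

fn read_u32(slice: &[u8], offset: usize) -> u32 {
    let buf = slice.split_at(offset).1.first_chunk::<4>()
        .expect("read_u32: slice too short");
    u32::from_ne_bytes(*buf)
}

/// The <= 16-byte branch of Hasher::write as it stands after patch 8:
/// read from both ends of the input, mix in the fold seed and a
/// length-dependent rotation of the accumulator, then fold once.
fn hash_short(bytes: &[u8], accumulator: u64, fold_seed: u64) -> u64 {
    debug_assert!(bytes.len() <= 16);
    let len = bytes.len();
    let (mut s0, mut s1) = (0u64, 0u64);
    if len >= 8 {
        s0 = read_u64(bytes, 0);
        s1 = read_u64(bytes, len - 8);
    } else if len >= 4 {
        s0 = read_u32(bytes, 0) as u64;
        s1 = read_u32(bytes, len - 4) as u64;
    } else if len > 0 {
        let (lo, mid, hi) = (bytes[0], bytes[len / 2], bytes[len - 1]);
        s0 = hi as u64;
        s1 = ((lo as u64) << 45) | mid as u64;
    }
    s0 ^= fold_seed;
    s1 ^= accumulator.rotate_right(len as u32);
    folded_multiply(s0, s1)
}

fn main() {
    // Placeholder accumulator and fold seed (hex digits of pi), purely for
    // demonstration; the crate derives these from its seeding machinery.
    let (acc, fold_seed) = (0x243f6a8885a308d3u64, 0x13198a2e03707344u64);
    println!("{:016x}", hash_short(b"hello world", acc, fold_seed));
    println!("{:016x}", hash_short(b"abc", acc, fold_seed));
}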