diff --git a/Cargo.toml b/Cargo.toml index 309eed4..46c38a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.1" description = "A library for performing Content-Defined Chunking (CDC) on data streams." readme = "README.md" license = "MIT" +edition = "2021" authors = ["Vincent Cantin "] homepage = "https://github.com/green-coder/cdc" diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index a4c8981..4b05622 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -1,22 +1,34 @@ -extern crate cdc; -extern crate criterion; - use cdc::{Rabin64, RollingHash64}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -pub fn slide_benchmarks(c: &mut Criterion) { - for i in [1_000, 10_000, 100_000] { - c.bench_function(&format!("slide {}x", i), |b| { - let data: u8 = 16; //arbitrary value - b.iter(|| { - let mut rabin = Rabin64::new(5); - for _ in 0..i { - rabin.slide(&data) - } - }) +fn slide_benchmarks(c: &mut Criterion) { + let mut group = c.benchmark_group("slide"); + let data = 16; + for size in [1_000, 10_000, 100_000] { + group.throughput(Throughput::Bytes(size)); + group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| { + b.iter_batched( + || Rabin64::new(5), + |mut rabin| { + for _ in 0..size { + rabin.slide(black_box(&data)); + } + }, + criterion::BatchSize::SmallInput, + ); }); } } -criterion_group!(benches, slide_benchmarks); +fn create_benchmarks(c: &mut Criterion) { + c.bench_function("new", |b| { + b.iter(|| Rabin64::new(5)); + }); + + c.bench_function("with_polynom", |b| { + b.iter(|| Rabin64::new_with_polynom(5, &0x3847fe406c36e1)); + }); +} + +criterion_group!(benches, slide_benchmarks, create_benchmarks); criterion_main!(benches); diff --git a/examples/chunk.rs b/examples/chunk.rs index 41edfef..69aa083 100644 --- a/examples/chunk.rs +++ b/examples/chunk.rs @@ -1,5 +1,3 @@ -extern crate cdc; - use std::cmp::{max, min}; use std::fs::File; use std::io; diff --git a/examples/separator.rs b/examples/separator.rs index 8d6b54b..38803aa 100644 --- a/examples/separator.rs +++ b/examples/separator.rs @@ -1,5 +1,3 @@ -extern crate cdc; - use std::fs::File; use std::io; use std::io::prelude::*; diff --git a/examples/tree01.rs b/examples/tree01.rs index defb5bb..22ab50a 100644 --- a/examples/tree01.rs +++ b/examples/tree01.rs @@ -1,22 +1,17 @@ -extern crate cdc; - +use std::sync::atomic::{AtomicU32, Ordering}; use cdc::*; type IntHash = u32; -static mut HASH_ID: IntHash = 0; +static HASH_ID: AtomicU32 = AtomicU32::new(0); fn get_new_hash_id() -> IntHash { - unsafe { - let id = HASH_ID; - HASH_ID += 1; - id - } + HASH_ID.fetch_add(1, Ordering::Relaxed) } fn my_new_node(level: usize, children: &Vec) -> Node { Node { hash: get_new_hash_id(), - level: level, + level, children: children.clone(), } } @@ -29,9 +24,7 @@ fn main() { level: *level, }); - unsafe { - HASH_ID = levels.len() as IntHash; - } + HASH_ID.store(levels.len() as _, Ordering::Relaxed); for node in NodeIter::new(hashed_chunk_it, my_new_node, 0) { println!("{:?}", node); diff --git a/examples/tree02.rs b/examples/tree02.rs index f382232..7c056a9 100644 --- a/examples/tree02.rs +++ b/examples/tree02.rs @@ -1,6 +1,3 @@ -extern crate cdc; -extern crate ring; - #[macro_use] extern crate arrayref; @@ -18,10 +15,7 @@ pub struct DigestReader { impl DigestReader { pub fn new(inner: R, digest: digest::Context) -> DigestReader { - DigestReader { - inner: inner, - digest: digest, - } + DigestReader { inner, digest } } } @@ -48,11 +42,11 @@ fn new_hash_node(level: usize, children: &Vec) -> Node { ctx.update(child); } let digest = ctx.finish(); - let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone(); + let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8]; Node { - hash: hash, - level: level, + hash, + level, children: children.clone(), } } @@ -79,15 +73,12 @@ fn chunk_file(path: &String) -> io::Result<()> { digest_reader.digest.update(&[0u8]); // To mark that it is a chunk, not a node. io::copy(&mut digest_reader, &mut io::sink()).unwrap(); let digest = digest_reader.digest.finish(); - let hash: Hash256 = array_ref![digest.as_ref(), 0, 256 / 8].clone(); + let hash: Hash256 = *array_ref![digest.as_ref(), 0, 256 / 8]; // Calculates the level of the separators. let level = HashToLevel::custom_new(13, 3).to_level(chunk.separator_hash); - HashedChunk { - hash: hash, - level: level, - } + HashedChunk { hash, level } }); // Builds a tree of hash nodes. diff --git a/src/lib.rs b/src/lib.rs index 8c376b3..a22a5ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,8 @@ mod rolling_hash; mod separator; mod tree; -pub use chunk::{Chunk, ChunkIter}; -pub use polynom::{Polynom, Polynom64}; -pub use rolling_hash::{Rabin64, RollingHash64}; -pub use separator::{HashToLevel, Separator, SeparatorIter}; -pub use tree::{HashedChunk, Node, NodeIter}; +pub use crate::chunk::{Chunk, ChunkIter}; +pub use crate::polynom::{Polynom, Polynom64}; +pub use crate::rolling_hash::{Rabin64, RollingHash64}; +pub use crate::separator::{HashToLevel, Separator, SeparatorIter}; +pub use crate::tree::{HashedChunk, Node, NodeIter}; diff --git a/src/rolling_hash.rs b/src/rolling_hash.rs index ca5fa08..4c80299 100644 --- a/src/rolling_hash.rs +++ b/src/rolling_hash.rs @@ -2,12 +2,23 @@ use super::{Polynom, Polynom64}; pub trait RollingHash64 { fn reset(&mut self); - fn prefill_window(&mut self, iter: &mut I) -> usize + + /// Attempt to fills the window - 1 byte. + fn prefill_window(&mut self, iter: I) -> usize where I: Iterator; - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize + + /// Combine a reset, and prefill_window + /// + /// This should have the same effect as calling reset() and prefill_window(), + /// but an implementation may be able to do so more efficiently. + fn reset_and_prefill_window(&mut self, iter: I) -> usize where - I: Iterator; + I: Iterator, + { + self.reset(); + self.prefill_window(iter) + } fn slide(&mut self, byte: &u8); fn get_hash(&self) -> &Polynom64; } @@ -32,7 +43,7 @@ pub struct Rabin64 { pub const MOD_POLYNOM: Polynom64 = 0x3DA3358B4DC173; impl Rabin64 { - pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] { + fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] { let mut out_table = [0; 256]; for (b, elem) in out_table.iter_mut().enumerate() { let mut hash = (b as Polynom64).modulo(mod_polynom); @@ -46,7 +57,7 @@ impl Rabin64 { out_table } - pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] { + fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] { let mut mod_table = [0; 256]; let k = mod_polynom.degree(); for (b, elem) in mod_table.iter_mut().enumerate() { @@ -57,11 +68,13 @@ impl Rabin64 { mod_table } - pub fn new(window_size_nb_bits: u32) -> Rabin64 { + pub fn new(window_size_nb_bits: u32) -> Self { Self::new_with_polynom(window_size_nb_bits, &MOD_POLYNOM) } - pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 { + pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Self { + // We don't really want to allocate 4 GiB of memory for the window. + assert!(window_size_nb_bits < 32); let window_size = 1 << window_size_nb_bits; let window_data = vec![0; window_size]; @@ -83,15 +96,14 @@ impl Rabin64 { for v in bytes { self.hash <<= 8; self.hash |= *v as Polynom64; - self.hash = self.hash.modulo(&mod_polynom); + self.hash = self.hash.modulo(mod_polynom); } } } impl RollingHash64 for Rabin64 { fn reset(&mut self) { - self.window_data.clear(); - self.window_data.resize(self.window_size, 0); + self.window_data.fill(0); self.window_index = 0; self.hash = 0; @@ -99,52 +111,40 @@ impl RollingHash64 for Rabin64 { // self.slide(1); } - // Attempt to fills the window - 1 byte. - fn prefill_window(&mut self, iter: &mut I) -> usize + fn prefill_window(&mut self, iter: I) -> usize where I: Iterator, { let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - self.slide(&b); - nb_bytes_read += 1; - } - None => break, - } + for byte in iter.take(self.window_size - 1) { + self.slide(&byte); + nb_bytes_read += 1; } nb_bytes_read } - // Combines a reset with a prefill in an optimized way. - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize + fn reset_and_prefill_window(&mut self, iter: I) -> usize where I: Iterator, { self.hash = 0; let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - // Take the old value out of the window and the hash. - // ... let's suppose that the buffer contains zeroes, do nothing. - - // Put the new value in the window and in the hash. - self.window_data[self.window_index] = b; - let mod_index = (self.hash >> self.polynom_shift) & 255; - self.hash <<= 8; - self.hash |= b as Polynom64; - self.hash ^= self.mod_table[mod_index as usize]; - - // Move the windowIndex to the next position. - self.window_index = (self.window_index + 1) & self.window_size_mask; - - nb_bytes_read += 1; - } - None => break, - } + for b in iter.take(self.window_size - 1) { + // Take the old value out of the window and the hash. + // ... let's suppose that the buffer contains zeroes, do nothing. + + // Put the new value in the window and in the hash. + self.window_data[self.window_index] = b; + let mod_index = (self.hash >> self.polynom_shift) & 255; + self.hash <<= 8; + self.hash |= b as Polynom64; + self.hash ^= self.mod_table[mod_index as usize]; + + // Move the windowIndex to the next position. + self.window_index = (self.window_index + 1) & self.window_size_mask; + + nb_bytes_read += 1; } // Because we didn't overwrite that element in the loop above. diff --git a/src/separator.rs b/src/separator.rs index e6f9a97..5c5ecf6 100644 --- a/src/separator.rs +++ b/src/separator.rs @@ -113,6 +113,12 @@ impl HashToLevel { } } +impl Default for HashToLevel { + fn default() -> Self { + Self::new() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/tree.rs b/src/tree.rs index 8db9006..e98c351 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -1,5 +1,5 @@ -/// Example of type to use with the generic structures below. -//pub type Hash256 = [u8; 256/8]; +// Example of type to use with the generic structures below. +// pub type Hash256 = [u8; 256/8]; #[derive(Debug)] pub struct HashedChunk {