Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/hll_usage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use datasketches::hll::{HllSketch, HllType};
use datasketches::hll::HllSketch;
use datasketches::hll::HllType;

fn main() {
// Create a new HLL sketch
Expand Down
6 changes: 6 additions & 0 deletions rustfmt.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@

edition = "2024"
reorder_imports = true

comment_width = 120
format_code_in_doc_comments = true
group_imports = "StdExternalCrate"
imports_granularity = "Item"
wrap_comments = true
10 changes: 7 additions & 3 deletions src/hll/array4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@

use super::aux_map::AuxMap;
use crate::error::SerdeError;
use crate::hll::NumStdDev;
use crate::hll::estimator::HipEstimator;
use crate::hll::{NumStdDev, get_slot, get_value};
use crate::hll::get_slot;
use crate::hll::get_value;

const AUX_TOKEN: u8 = 15;

Expand Down Expand Up @@ -288,8 +290,9 @@ impl Array4 {
compact: bool,
ooo: bool,
) -> Result<Self, SerdeError> {
use crate::hll::get_slot;
use crate::hll::get_value;
use crate::hll::serialization::*;
use crate::hll::{get_slot, get_value};

if bytes.len() < HLL_PREAMBLE_SIZE {
return Err(SerdeError::InsufficientData(format!(
Expand Down Expand Up @@ -436,7 +439,8 @@ impl Array4 {
#[cfg(test)]
mod tests {
use super::*;
use crate::hll::{coupon, pack_coupon};
use crate::hll::coupon;
use crate::hll::pack_coupon;

#[test]
fn test_get_set_raw() {
Expand Down
7 changes: 5 additions & 2 deletions src/hll/array6.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
//! cur_min optimization like Array4.

use crate::error::SerdeError;
use crate::hll::NumStdDev;
use crate::hll::estimator::HipEstimator;
use crate::hll::{NumStdDev, get_slot, get_value};
use crate::hll::get_slot;
use crate::hll::get_value;

const VAL_MASK_6: u16 = 0x3F; // 6 bits: 0b0011_1111

Expand Down Expand Up @@ -278,7 +280,8 @@ fn num_bytes_for_k(k: u32) -> usize {
#[cfg(test)]
mod tests {
use super::*;
use crate::hll::{coupon, pack_coupon};
use crate::hll::coupon;
use crate::hll::pack_coupon;

#[test]
fn test_num_bytes_calculation() {
Expand Down
10 changes: 7 additions & 3 deletions src/hll/array8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
//! This provides the maximum value range (0-255) with no bit-packing complexity.

use crate::error::SerdeError;
use crate::hll::NumStdDev;
use crate::hll::estimator::HipEstimator;
use crate::hll::{NumStdDev, get_slot, get_value};
use crate::hll::get_slot;
use crate::hll::get_value;

/// Core Array8 data structure - one byte per slot, no packing
#[derive(Debug, Clone, PartialEq)]
Expand Down Expand Up @@ -218,7 +220,8 @@ impl Array8 {
self.num_zeros = self.bytes.iter().filter(|&&v| v == 0).count() as u32;

// Recompute kxq values from actual register values
// This is essential after bulk merges where registers change but estimator isn't updated incrementally
// This is essential after bulk merges where registers change but estimator isn't updated
// incrementally
let mut kxq0_sum = 0.0;
let mut kxq1_sum = 0.0;

Expand Down Expand Up @@ -342,7 +345,8 @@ impl Array8 {
#[cfg(test)]
mod tests {
use super::*;
use crate::hll::{coupon, pack_coupon};
use crate::hll::coupon;
use crate::hll::pack_coupon;

#[test]
fn test_array8_basic() {
Expand Down
6 changes: 5 additions & 1 deletion src/hll/aux_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@
//! Stores slot-value pairs for values that don't fit in the 4-bit main array.
//! Uses open addressing with stride-based probing for collision resolution.

use crate::hll::{RESIZE_DENOMINATOR, RESIZE_NUMERATOR, get_slot, get_value, pack_coupon};
use crate::hll::RESIZE_DENOMINATOR;
use crate::hll::RESIZE_NUMERATOR;
use crate::hll::get_slot;
use crate::hll::get_value;
use crate::hll::pack_coupon;

const ENTRY_EMPTY: u32 = 0;

Expand Down
6 changes: 4 additions & 2 deletions src/hll/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
//! Provides a simple array-based storage for coupons (hash values) with
//! cubic interpolation-based cardinality estimation and confidence bounds.

use crate::hll::coupon_mapping::{X_ARR, Y_ARR};
use crate::hll::COUPON_RSE;
use crate::hll::NumStdDev;
use crate::hll::coupon_mapping::X_ARR;
use crate::hll::coupon_mapping::Y_ARR;
use crate::hll::cubic_interpolation::using_x_and_y_tables;
use crate::hll::{COUPON_RSE, NumStdDev};

/// Sentinel value indicating an empty coupon slot
pub const COUPON_EMPTY: u32 = 0;
Expand Down
12 changes: 7 additions & 5 deletions src/hll/estimator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,20 @@
//! This is more accurate than the standard HLL estimator, especially for
//! moderate cardinalities.

use crate::hll::{composite_interpolation, cubic_interpolation, harmonic_numbers};
use crate::hll::composite_interpolation;
use crate::hll::cubic_interpolation;
use crate::hll::harmonic_numbers;

/// HIP estimator with KxQ registers for improved cardinality estimation
///
/// This struct encapsulates all estimation-related state and logic,
/// allowing it to be composed into Array4, Array6, and Array8.
///
/// The estimator supports two modes:
/// - **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator
/// for accurate sequential updates
/// - **Out-of-order mode**: Uses composite estimator (raw HLL + linear counting)
/// after deserialization or merging
/// - **In-order mode**: Uses HIP (Historical Inverse Probability) accumulator for accurate
/// sequential updates
/// - **Out-of-order mode**: Uses composite estimator (raw HLL + linear counting) after
/// deserialization or merging
#[derive(Debug, Clone, PartialEq)]
pub struct HipEstimator {
/// HIP estimator accumulator
Expand Down
6 changes: 4 additions & 2 deletions src/hll/hash_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
//! Provides better performance than List when many coupons are stored.

use crate::error::SerdeError;
use crate::hll::container::{COUPON_EMPTY, Container};
use crate::hll::HllType;
use crate::hll::KEY_MASK_26;
use crate::hll::container::COUPON_EMPTY;
use crate::hll::container::Container;
use crate::hll::serialization::*;
use crate::hll::{HllType, KEY_MASK_26};

/// Hash set for efficient coupon storage with collision handling
#[derive(Debug, Clone, PartialEq)]
Expand Down
3 changes: 2 additions & 1 deletion src/hll/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

use crate::error::SerdeError;
use crate::hll::HllType;
use crate::hll::container::{COUPON_EMPTY, Container};
use crate::hll::container::COUPON_EMPTY;
use crate::hll::container::Container;
use crate::hll::serialization::*;

/// List for sequential coupon storage with duplicate detection
Expand Down
4 changes: 3 additions & 1 deletion src/hll/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ fn coupon<H: Hash>(v: H) -> u32 {

#[cfg(test)]
mod tests {
use crate::hll::{get_slot, get_value, pack_coupon};
use crate::hll::get_slot;
use crate::hll::get_value;
use crate::hll::pack_coupon;

#[test]
fn test_pack_unpack_coupon() {
Expand Down
6 changes: 5 additions & 1 deletion src/hll/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,19 @@
use std::hash::Hash;

use crate::error::SerdeError;
use crate::hll::HllType;
use crate::hll::NumStdDev;
use crate::hll::RESIZE_DENOMINATOR;
use crate::hll::RESIZE_NUMERATOR;
use crate::hll::array4::Array4;
use crate::hll::array6::Array6;
use crate::hll::array8::Array8;
use crate::hll::container::Container;
use crate::hll::coupon;
use crate::hll::hash_set::HashSet;
use crate::hll::list::List;
use crate::hll::mode::Mode;
use crate::hll::serialization::*;
use crate::hll::{HllType, NumStdDev, RESIZE_DENOMINATOR, RESIZE_NUMERATOR, coupon};

/// A HyperLogLog sketch.
///
Expand Down
14 changes: 9 additions & 5 deletions src/hll/union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,16 @@
//! - Different modes (List, Set, Array4/6/8)
//! - Different target HLL types

use std::hash::Hash;

use crate::hll::HllSketch;
use crate::hll::HllType;
use crate::hll::NumStdDev;
use crate::hll::array4::Array4;
use crate::hll::array6::Array6;
use crate::hll::array8::Array8;
use crate::hll::mode::Mode;
use crate::hll::{HllSketch, HllType, NumStdDev, pack_coupon};
use std::hash::Hash;
use crate::hll::pack_coupon;

/// An HLL Union for combining multiple HLL sketches.
///
Expand All @@ -55,9 +59,9 @@ impl HllUnion {
///
/// # Arguments
///
/// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4, 21].
/// This determines the maximum precision the union can handle. Input sketches
/// with larger lg_k will be down-sampled.
/// * `lg_max_k` - Maximum log2 of the number of buckets. Must be in [4, 21]. This determines
/// the maximum precision the union can handle. Input sketches with larger lg_k will be
/// down-sampled.
///
/// # Panics
///
Expand Down
10 changes: 7 additions & 3 deletions src/tdigest/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
// specific language governing permissions and limitations
// under the License.

use crate::error::SerdeError;
use crate::tdigest::serialization::*;
use byteorder::{BE, LE, ReadBytesExt};
use std::cmp::Ordering;
use std::convert::identity;
use std::io::Cursor;
use std::num::NonZeroU64;

use byteorder::BE;
use byteorder::LE;
use byteorder::ReadBytesExt;

use crate::error::SerdeError;
use crate::tdigest::serialization::*;

/// The default value of K if one is not specified.
const DEFAULT_K: u16 = 200;
/// Multiplier for buffer size relative to centroids capacity.
Expand Down
5 changes: 4 additions & 1 deletion tests/hll_union_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@
//!
//! This mirrors the testing strategy used in hll_update_test.rs

use datasketches::hll::{HllSketch, HllType, HllUnion, NumStdDev};
use datasketches::hll::HllSketch;
use datasketches::hll::HllType;
use datasketches::hll::HllUnion;
use datasketches::hll::NumStdDev;

#[test]
fn test_union_basic_operations() {
Expand Down
4 changes: 3 additions & 1 deletion tests/hll_update_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use datasketches::hll::{HllSketch, HllType, NumStdDev};
use datasketches::hll::HllSketch;
use datasketches::hll::HllType;
use datasketches::hll::NumStdDev;

#[test]
fn test_basic_update() {
Expand Down
3 changes: 2 additions & 1 deletion tests/tdigest_serialization_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ use common::serialization_test_data;
use common::test_data;
use datasketches::tdigest::TDigestMut;
use googletest::assert_that;
use googletest::prelude::{eq, near};
use googletest::prelude::eq;
use googletest::prelude::near;

fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) {
let bytes = fs::read(&path).unwrap();
Expand Down
3 changes: 2 additions & 1 deletion tests/tdigest_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

use datasketches::tdigest::TDigestMut;
use googletest::assert_that;
use googletest::prelude::{eq, near};
use googletest::prelude::eq;
use googletest::prelude::near;

#[test]
fn test_empty() {
Expand Down