From bc07377f02cd25419536d1d664e32593ec868cc8 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 27 Apr 2023 12:42:55 -0700 Subject: [PATCH 01/72] some basic design --- Cargo.toml | 1 + src/genome_chunked_array.rs | 42 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 3 files changed, 44 insertions(+) create mode 100644 src/genome_chunked_array.rs diff --git a/Cargo.toml b/Cargo.toml index dca652a..6fea19e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ forrustts = {version = "0.3.0-alpha.0", git = "https://github.com/ForwardSimulat rand = "0.8.5" rand_distr = "0.4.3" clap = { version = "4.2.1", features = ["derive"] } +rclite = "0.2.2" [dev-dependencies] proptest = "1.1.0" diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs new file mode 100644 index 0000000..20c5e19 --- /dev/null +++ b/src/genome_chunked_array.rs @@ -0,0 +1,42 @@ +// Mutations will be stored in blocks of 8 +// indexes +static CHUNK_SIZE: u32 = 8; + +struct Chunk { + start: u32, + stop: u32, +} + +struct MutationChunks { + mutations: Vec, + chunks: Vec>, +} + +struct Individuals { + chunks: Vec>, + starts: Vec, + stops: Vec, +} + +#[cfg(test)] +mod sinful_tests { + use std::num::NonZeroU32; + + use super::*; + + #[test] + fn test_sizes() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::>() + ); + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::>>() + ); + assert_eq!( + std::mem::size_of::>(), + std::mem::size_of::() + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index d1867b8..1b548f1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ mod common; pub mod fwdpp_copy; pub mod genome_array; +pub mod genome_chunked_array; // Public "API" for examples pub use common::SimParams; From 28978ad641bef209279b7446a0065538f8e048b6 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 09:47:26 -0700 Subject: [PATCH 02/72] For a first pass, let's do chunks of 64 mutations --- src/genome_chunked_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 20c5e19..b9bd6e9 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -1,6 +1,6 @@ -// Mutations will be stored in blocks of 8 +// Mutations will be stored in blocks of 64 // indexes -static CHUNK_SIZE: u32 = 8; +static CHUNK_SIZE: u32 = 64; struct Chunk { start: u32, From 78af04c1bb0541ea8642ac056edabb30a753151b Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 09:53:33 -0700 Subject: [PATCH 03/72] Renaming --- src/genome_chunked_array.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index b9bd6e9..5938115 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -9,13 +9,13 @@ struct Chunk { struct MutationChunks { mutations: Vec, - chunks: Vec>, + chunks: Vec, } -struct Individuals { - chunks: Vec>, - starts: Vec, - stops: Vec, +struct Genomes { + chunks: Vec, // each genome is a range of chunk indexes + starts: Vec, // For each genome, where is its first chunk? + stops: Vec, // One past last chunk such that a genome is chunks[starts[i]..stops[i]] } #[cfg(test)] From 3279200664ab66e0a6bf83fcf4a2ce6a7d8d6f25 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 10:28:44 -0700 Subject: [PATCH 04/72] some musings --- src/genome_chunked_array.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 5938115..ec85cc6 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -1,3 +1,5 @@ +use crate::common::Mutation; + // Mutations will be stored in blocks of 64 // indexes static CHUNK_SIZE: u32 = 64; @@ -8,8 +10,9 @@ struct Chunk { } struct MutationChunks { - mutations: Vec, - chunks: Vec, + mutations: Vec, // indexes into mutation vector. + // u32::MAX is treated as a NULL "sentinel" + chunks: Vec, // Is this needed? } struct Genomes { @@ -39,4 +42,11 @@ mod sinful_tests { std::mem::size_of::() ); } + + #[test] + fn test_non_zero_int_types() { + let x = NonZeroU32::new(1).unwrap(); + let y = x.checked_add(1).unwrap(); + assert_eq!(y, 2.try_into().unwrap()); + } } From f3b3badf0d18372f514ef667b4ca212fad11d882 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 10:38:40 -0700 Subject: [PATCH 05/72] more musings --- src/genome_chunked_array.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index ec85cc6..ce48f60 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -2,7 +2,7 @@ use crate::common::Mutation; // Mutations will be stored in blocks of 64 // indexes -static CHUNK_SIZE: u32 = 64; +const CHUNK_SIZE: u32 = 64; struct Chunk { start: u32, @@ -11,8 +11,12 @@ struct Chunk { struct MutationChunks { mutations: Vec, // indexes into mutation vector. - // u32::MAX is treated as a NULL "sentinel" - chunks: Vec, // Is this needed? + // u32::MAX is treated as a NULL "sentinel" + chunks: Vec, // Is this needed? +} + +impl MutationChunks { + const CHUNK_SIZE: u32 = CHUNK_SIZE; // Maybe this should be here? } struct Genomes { From 5deda3166b12039e1f78a4d1bdd9aaecc22c4398 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 10:45:23 -0700 Subject: [PATCH 06/72] baby steps to tests --- src/genome_chunked_array.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index ce48f60..65c0630 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -25,6 +25,22 @@ struct Genomes { stops: Vec, // One past last chunk such that a genome is chunks[starts[i]..stops[i]] } +impl Genomes {} + +#[cfg(test)] +mod development_tests { + use super::Genomes; + + #[test] + fn add_mutations() { + let mut genomes = Genomes { + chunks: vec![], + starts: vec![], + stops: vec![], + }; + } +} + #[cfg(test)] mod sinful_tests { use std::num::NonZeroU32; From cd32eee76e5a263099ba39bb183429f0b438abab Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 14:48:38 -0700 Subject: [PATCH 07/72] more about what we actually need --- src/genome_chunked_array.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 65c0630..f276467 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -2,7 +2,7 @@ use crate::common::Mutation; // Mutations will be stored in blocks of 64 // indexes -const CHUNK_SIZE: u32 = 64; +const CHUNK_SIZE: usize = 64; struct Chunk { start: u32, @@ -16,7 +16,7 @@ struct MutationChunks { } impl MutationChunks { - const CHUNK_SIZE: u32 = CHUNK_SIZE; // Maybe this should be here? + const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? } struct Genomes { @@ -25,19 +25,39 @@ struct Genomes { stops: Vec, // One past last chunk such that a genome is chunks[starts[i]..stops[i]] } +// What we really need: +// 1. Do we need a new chunk? +// 2. If yes, push (or recycle) one. +// 3. If no, fill what we can of the current (last?) chunk. +fn add_mutations(mutation_keys: &[u32], genome: usize, genomes: &mut Genomes) { + genomes.starts.push(0); + genomes.stops.push(CHUNK_SIZE); + genomes.chunks.extend_from_slice(mutation_keys); + for _ in 0..CHUNK_SIZE - mutation_keys.len() { + genomes.chunks.push(u32::MAX); + } +} + impl Genomes {} #[cfg(test)] mod development_tests { - use super::Genomes; + use super::*; #[test] - fn add_mutations() { + fn test_add_mutations() { let mut genomes = Genomes { chunks: vec![], starts: vec![], stops: vec![], }; + + let mutations = [1, 2, 3]; + add_mutations(&mutations, 0, &mut genomes); + assert_eq!(genomes.starts.len(), 1); + assert_eq!(genomes.stops.len(), 1); + assert_eq!(genomes.stops[0] - genomes.starts[0], CHUNK_SIZE); + assert_eq!(genomes.chunks.len(), CHUNK_SIZE); } } From cd91cb4699d16a225b6282223abe11a7b35495b7 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 15:16:06 -0700 Subject: [PATCH 08/72] more design notes --- src/genome_chunked_array.rs | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index f276467..e28e70f 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -25,16 +25,39 @@ struct Genomes { stops: Vec, // One past last chunk such that a genome is chunks[starts[i]..stops[i]] } +impl Genomes { + fn is_empty(&self) -> bool { + assert_eq!(self.starts.len(), self.stops.len()); + self.starts.is_empty() + } +} + // What we really need: // 1. Do we need a new chunk? // 2. If yes, push (or recycle) one. // 3. If no, fill what we can of the current (last?) chunk. -fn add_mutations(mutation_keys: &[u32], genome: usize, genomes: &mut Genomes) { - genomes.starts.push(0); - genomes.stops.push(CHUNK_SIZE); - genomes.chunks.extend_from_slice(mutation_keys); - for _ in 0..CHUNK_SIZE - mutation_keys.len() { - genomes.chunks.push(u32::MAX); +// Actually, a lot of this is wrong: +// 1. We don't know what a "genome" is, yet, in memory. +// 2. So, are we adding to a genome, updating an existing +// genome, or what? +// So what we actually really need is: +// 1. To know the parental genome identifiers +// 2. To be able to get new chunks, which means that MutationChunks +// has to be part of the conversation. +// 3. Need partition searching to copy parental stuff over. +// 4. Etc.. +fn add_mutations( + mutation_keys: &[u32], // The indexes of the new mutations + genome: usize, // The index of the genome that will "get" the new mutations + genomes: &mut Genomes, // The output +) { + if genomes.is_empty() { + genomes.starts.push(0); + genomes.stops.push(CHUNK_SIZE); + genomes.chunks.extend_from_slice(mutation_keys); + for _ in 0..CHUNK_SIZE - mutation_keys.len() { + genomes.chunks.push(u32::MAX); + } } } From 41ccf858c99bc4c17ec418860438001441788034 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 16:19:14 -0700 Subject: [PATCH 09/72] some infrastructure --- src/genome_chunked_array.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index e28e70f..d0db46a 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -9,14 +9,39 @@ struct Chunk { stop: u32, } +#[derive(Default)] struct MutationChunks { mutations: Vec, // indexes into mutation vector. // u32::MAX is treated as a NULL "sentinel" chunks: Vec, // Is this needed? + counts: Vec, + queue: Vec, } impl MutationChunks { const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? + + fn new_chunk_mut(&mut self) -> (usize, &mut [u32]) { + assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); + + match self.queue.pop() { + Some(index) => { + let mslice = + &mut self.mutations[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE]; + mslice.fill(u32::MAX); + (index, mslice) + } + None => { + let id = self.mutations.len() / Self::CHUNK_SIZE; + self.mutations + .resize(self.mutations.len() + Self::CHUNK_SIZE, u32::MAX); + ( + id, + &mut self.mutations[id * CHUNK_SIZE..id * CHUNK_SIZE + CHUNK_SIZE], + ) + } + } + } } struct Genomes { @@ -46,7 +71,7 @@ impl Genomes { // has to be part of the conversation. // 3. Need partition searching to copy parental stuff over. // 4. Etc.. -fn add_mutations( +fn generate_offspring_genome( mutation_keys: &[u32], // The indexes of the new mutations genome: usize, // The index of the genome that will "get" the new mutations genomes: &mut Genomes, // The output @@ -76,7 +101,7 @@ mod development_tests { }; let mutations = [1, 2, 3]; - add_mutations(&mutations, 0, &mut genomes); + generate_offspring_genome(&mutations, 0, &mut genomes); assert_eq!(genomes.starts.len(), 1); assert_eq!(genomes.stops.len(), 1); assert_eq!(genomes.stops[0] - genomes.starts[0], CHUNK_SIZE); From 2a4ace8f5966c7122afbba10954fcf85e8bcd844 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 16:32:31 -0700 Subject: [PATCH 10/72] on the wrong path, I think? --- src/genome_chunked_array.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index d0db46a..b0b39c2 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -55,6 +55,10 @@ impl Genomes { assert_eq!(self.starts.len(), self.stops.len()); self.starts.is_empty() } + + fn genome(&self, index: usize) -> &[u32] { + &self.chunks[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE] + } } // What we really need: @@ -73,9 +77,18 @@ impl Genomes { // 4. Etc.. fn generate_offspring_genome( mutation_keys: &[u32], // The indexes of the new mutations - genome: usize, // The index of the genome that will "get" the new mutations - genomes: &mut Genomes, // The output + // NOTE: we are already bowing + // to the borrow checker: + // we cannot have parental/offspring + // genomes in the same container. + // Do "chunks" and "genomes" need to + // be in the same struct? + parents: (&[u32], &[u32]), // parental genomes + mutation_chunks: &mut MutationChunks, // Output chunks + genomes: &mut Genomes, // Output genomes ) { + let parent_one_genome = parents.0; + let parent_two_genome = parents.1; if genomes.is_empty() { genomes.starts.push(0); genomes.stops.push(CHUNK_SIZE); @@ -100,8 +113,9 @@ mod development_tests { stops: vec![], }; + let mut chunks = MutationChunks::default(); let mutations = [1, 2, 3]; - generate_offspring_genome(&mutations, 0, &mut genomes); + generate_offspring_genome(&mutations, (&[], &[]), &mut chunks, &mut genomes); assert_eq!(genomes.starts.len(), 1); assert_eq!(genomes.stops.len(), 1); assert_eq!(genomes.stops[0] - genomes.starts[0], CHUNK_SIZE); From 3185bbd2b5db4348117adc21ee3f8648255948eb Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 1 May 2023 17:08:55 -0700 Subject: [PATCH 11/72] thoughts on what and OO version could look like --- src/genome_chunked_array.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index b0b39c2..2ba4879 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -4,6 +4,39 @@ use crate::common::Mutation; // indexes const CHUNK_SIZE: usize = 64; +// Pseudocode of what an OO +// version may look like +mod pseudocode { + struct Genome { + chunks: Vec, // indexes into DiploidPopulation::mutation_chunks::chunks + count: u32, + } + + struct MutationChunk { + mutations: [u32; super::CHUNK_SIZE], // indexes into DiploidPopulation::mutations + count: u32, + } + + impl MutationChunk { + fn new_empty() -> Self { + Self { + mutations: [u32::MAX; super::CHUNK_SIZE], + count: 0, + } + } + } + + struct MutationChunks { + chunks: Vec, + } + + struct DiploidPopulation { + genomes: Vec, + mutation_chunks: MutationChunks, + mutations: Vec, + } +} + struct Chunk { start: u32, stop: u32, From 10f9d76871872ce8d1af2c83f108a4cef0c21f51 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 12:00:42 -0700 Subject: [PATCH 12/72] arrayref using std. yay --- src/genome_chunked_array.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 2ba4879..fed9793 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -184,4 +184,34 @@ mod sinful_tests { let y = x.checked_add(1).unwrap(); assert_eq!(y, 2.try_into().unwrap()); } + + //#[test] + //fn test_arrayref() { + // fn mod_array(x: &mut [i32; 3]) { + // x[2] = 4; + // } + // use arrayref::array_mut_ref; + // let mut x = vec![1, 2, 3]; + // mod_array(array_mut_ref![x, 0, 3]); + // assert_eq!(x[2], 4); + //} + + #[test] + fn test_slice_try_into_array() { + let mut v = vec![1; 64]; + let vs = &mut v[0..32]; + let s: &mut [i32; 32] = vs.try_into().unwrap(); + s[10] = 2; + assert_eq!(v[10], 2); + let vs = &mut v[0..64]; + let s: &mut [i32; 64] = vs.try_into().unwrap(); + s[10] = 3; + assert_eq!(v[10], 3); + + let mut v = vec![1; 256]; + let vs = &mut v[9..9 + 64]; + let s: &mut [i32; 64] = vs.try_into().unwrap(); + s[1] = 3; + assert_eq!(v[10], 3); + } } From fac8e508b37492dec7cfc590cd4eda821a918b41 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 13:42:47 -0700 Subject: [PATCH 13/72] some cool tricks. some terrible design --- src/genome_chunked_array.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index fed9793..30dfa29 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -43,7 +43,7 @@ struct Chunk { } #[derive(Default)] -struct MutationChunks { +pub struct MutationChunks { mutations: Vec, // indexes into mutation vector. // u32::MAX is treated as a NULL "sentinel" chunks: Vec, // Is this needed? @@ -54,6 +54,16 @@ struct MutationChunks { impl MutationChunks { const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? + pub fn chunk(&self, at: usize) -> &[u32; CHUNK_SIZE] { + let s = &self.mutations[at * CHUNK_SIZE..at * CHUNK_SIZE + CHUNK_SIZE]; + s.try_into().unwrap() + } + + pub fn chunk_mut(&mut self, at: usize) -> &mut [u32; CHUNK_SIZE] { + let s = &mut self.mutations[at * CHUNK_SIZE..at * CHUNK_SIZE + CHUNK_SIZE]; + s.try_into().unwrap() + } + fn new_chunk_mut(&mut self) -> (usize, &mut [u32]) { assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); @@ -214,4 +224,25 @@ mod sinful_tests { s[1] = 3; assert_eq!(v[10], 3); } + + #[test] + fn test_copy_free_array_from_struct() { + struct X { + x: Vec, + } + + impl X { + fn chunk(&mut self) -> &mut [i32; 4] { + let s = &mut self.x[0..4]; + s.try_into().unwrap() + } + } + + let mut x = X { + x: vec![0, 1, 2, 3], + }; + let c = x.chunk(); + c[0] += 10; + assert_eq!(x.x[0], 10); + } } From 689c12aac387df38932df8c0a9f3359ab175876f Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 13:52:34 -0700 Subject: [PATCH 14/72] some code re-use --- src/genome_chunked_array.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 30dfa29..e11b1db 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -64,24 +64,17 @@ impl MutationChunks { s.try_into().unwrap() } - fn new_chunk_mut(&mut self) -> (usize, &mut [u32]) { + // The None path may not be the most efficient + fn new_chunk_mut(&mut self) -> (usize, &mut [u32; CHUNK_SIZE]) { assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); match self.queue.pop() { - Some(index) => { - let mslice = - &mut self.mutations[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE]; - mslice.fill(u32::MAX); - (index, mslice) - } + Some(index) => (index, self.chunk_mut(index)), None => { let id = self.mutations.len() / Self::CHUNK_SIZE; self.mutations .resize(self.mutations.len() + Self::CHUNK_SIZE, u32::MAX); - ( - id, - &mut self.mutations[id * CHUNK_SIZE..id * CHUNK_SIZE + CHUNK_SIZE], - ) + (id, self.chunk_mut(id)) } } } From a6bb53252d9e6670e2d6846e7be4fd9961e7107c Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 14:31:31 -0700 Subject: [PATCH 15/72] use Self::CHUNK_SIZE --- src/genome_chunked_array.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index e11b1db..328adb2 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -54,20 +54,20 @@ pub struct MutationChunks { impl MutationChunks { const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? - pub fn chunk(&self, at: usize) -> &[u32; CHUNK_SIZE] { - let s = &self.mutations[at * CHUNK_SIZE..at * CHUNK_SIZE + CHUNK_SIZE]; + pub fn chunk(&self, at: usize) -> &[u32; Self::CHUNK_SIZE] { + let s = &self.mutations[at * Self::CHUNK_SIZE..at * Self::CHUNK_SIZE + Self::CHUNK_SIZE]; s.try_into().unwrap() } - pub fn chunk_mut(&mut self, at: usize) -> &mut [u32; CHUNK_SIZE] { - let s = &mut self.mutations[at * CHUNK_SIZE..at * CHUNK_SIZE + CHUNK_SIZE]; + pub fn chunk_mut(&mut self, at: usize) -> &mut [u32; Self::CHUNK_SIZE] { + let s = + &mut self.mutations[at * Self::CHUNK_SIZE..at * Self::CHUNK_SIZE + Self::CHUNK_SIZE]; s.try_into().unwrap() } // The None path may not be the most efficient - fn new_chunk_mut(&mut self) -> (usize, &mut [u32; CHUNK_SIZE]) { + fn new_chunk_mut(&mut self) -> (usize, &mut [u32; Self::CHUNK_SIZE]) { assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); - match self.queue.pop() { Some(index) => (index, self.chunk_mut(index)), None => { From 5b19cb6eb73378f447250f5bc2449824fd5b850a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 14:43:08 -0700 Subject: [PATCH 16/72] steps towards a working fn --- src/genome_chunked_array.rs | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 328adb2..170be02 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -112,24 +112,26 @@ impl Genomes { // 3. Need partition searching to copy parental stuff over. // 4. Etc.. fn generate_offspring_genome( - mutation_keys: &[u32], // The indexes of the new mutations // NOTE: we are already bowing // to the borrow checker: // we cannot have parental/offspring // genomes in the same container. // Do "chunks" and "genomes" need to // be in the same struct? - parents: (&[u32], &[u32]), // parental genomes - mutation_chunks: &mut MutationChunks, // Output chunks - genomes: &mut Genomes, // Output genomes + parents: (&[u32], &[u32]), // parental genomes + mutations: &[Mutation], + parent_mutation_chunks: &MutationChunks, // Output chunks + new_mutation_keys: &[u32], // The indexes of the new mutations + offspring_mutation_chunks: &mut MutationChunks, // Output chunks + genomes: &mut Genomes, // Output genomes ) { let parent_one_genome = parents.0; let parent_two_genome = parents.1; if genomes.is_empty() { genomes.starts.push(0); genomes.stops.push(CHUNK_SIZE); - genomes.chunks.extend_from_slice(mutation_keys); - for _ in 0..CHUNK_SIZE - mutation_keys.len() { + genomes.chunks.extend_from_slice(new_mutation_keys); + for _ in 0..CHUNK_SIZE - new_mutation_keys.len() { genomes.chunks.push(u32::MAX); } } @@ -149,9 +151,22 @@ mod development_tests { stops: vec![], }; + let mutations = vec![ + Mutation::new(0.try_into().unwrap(), vec![], 0.into()), + Mutation::new(1.try_into().unwrap(), vec![], 0.into()), + Mutation::new(1.try_into().unwrap(), vec![], 0.into()), + ]; + let pchunks = MutationChunks::default(); let mut chunks = MutationChunks::default(); - let mutations = [1, 2, 3]; - generate_offspring_genome(&mutations, (&[], &[]), &mut chunks, &mut genomes); + let new_mutations = [0, 1, 2]; + generate_offspring_genome( + (&[], &[]), + &mutations, + &pchunks, + &new_mutations, + &mut chunks, + &mut genomes, + ); assert_eq!(genomes.starts.len(), 1); assert_eq!(genomes.stops.len(), 1); assert_eq!(genomes.stops[0] - genomes.starts[0], CHUNK_SIZE); From 7f6eb477b541ef0bbfe827f6aabf38ced549900f Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 15:08:39 -0700 Subject: [PATCH 17/72] the borrow checker stops us. --- src/genome_chunked_array.rs | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 170be02..a7afe38 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -97,6 +97,15 @@ impl Genomes { } } +fn update_offspring_chunks<'c>( + offpsring_chunk_index: usize, + offspring_genomes: &mut Genomes, + offspring_mutation_chunks: &'c mut MutationChunks, + current_chunk: &'c mut [u32; CHUNK_SIZE], +) -> (usize, &'c mut [u32; CHUNK_SIZE]) { + (offpsring_chunk_index, current_chunk) +} + // What we really need: // 1. Do we need a new chunk? // 2. If yes, push (or recycle) one. @@ -126,14 +135,16 @@ fn generate_offspring_genome( genomes: &mut Genomes, // Output genomes ) { let parent_one_genome = parents.0; - let parent_two_genome = parents.1; - if genomes.is_empty() { - genomes.starts.push(0); - genomes.stops.push(CHUNK_SIZE); - genomes.chunks.extend_from_slice(new_mutation_keys); - for _ in 0..CHUNK_SIZE - new_mutation_keys.len() { - genomes.chunks.push(u32::MAX); - } + + let mut last_parent_index = 0_usize; + let (mut index, mut chunk) = offspring_mutation_chunks.new_chunk_mut(); + for m in new_mutation_keys { + // FIXME: here, we want to be skipping past chunks + // whose last position is < mutations[m].position() + let p = parent_one_genome.partition_point(|x| true); + + (index, chunk) = + update_offspring_chunks(index, genomes, offspring_mutation_chunks, &mut chunk); } } From 7765bd3c8e04b3264cc4abef6343f3888edb498e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 15:40:52 -0700 Subject: [PATCH 18/72] use indexes to get around the borrow checker --- src/genome_chunked_array.rs | 57 +++++++++++++------------------------ 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index a7afe38..e7279fd 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -37,9 +37,16 @@ mod pseudocode { } } +#[derive(Copy, Clone, Debug)] struct Chunk { - start: u32, - stop: u32, + start: usize, + stop: usize, +} + +impl Chunk { + fn new(start: usize, stop: usize) -> Self { + Self { start, stop } + } } #[derive(Default)] @@ -54,27 +61,20 @@ pub struct MutationChunks { impl MutationChunks { const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? - pub fn chunk(&self, at: usize) -> &[u32; Self::CHUNK_SIZE] { - let s = &self.mutations[at * Self::CHUNK_SIZE..at * Self::CHUNK_SIZE + Self::CHUNK_SIZE]; - s.try_into().unwrap() - } - - pub fn chunk_mut(&mut self, at: usize) -> &mut [u32; Self::CHUNK_SIZE] { - let s = - &mut self.mutations[at * Self::CHUNK_SIZE..at * Self::CHUNK_SIZE + Self::CHUNK_SIZE]; - s.try_into().unwrap() + fn chunk(&self, at: usize) -> Chunk { + Chunk::new(at * Self::CHUNK_SIZE, at * Self::CHUNK_SIZE + CHUNK_SIZE) } // The None path may not be the most efficient - fn new_chunk_mut(&mut self) -> (usize, &mut [u32; Self::CHUNK_SIZE]) { + fn add_new_chunk(&mut self) -> (usize, Chunk) { assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); match self.queue.pop() { - Some(index) => (index, self.chunk_mut(index)), + Some(index) => (index, self.chunk(index)), None => { let id = self.mutations.len() / Self::CHUNK_SIZE; self.mutations .resize(self.mutations.len() + Self::CHUNK_SIZE, u32::MAX); - (id, self.chunk_mut(id)) + (id, self.chunk(id)) } } } @@ -97,12 +97,12 @@ impl Genomes { } } -fn update_offspring_chunks<'c>( +fn update_offspring_chunks( offpsring_chunk_index: usize, offspring_genomes: &mut Genomes, - offspring_mutation_chunks: &'c mut MutationChunks, - current_chunk: &'c mut [u32; CHUNK_SIZE], -) -> (usize, &'c mut [u32; CHUNK_SIZE]) { + offspring_mutation_chunks: &mut MutationChunks, + current_chunk: Chunk, +) -> (usize, Chunk) { (offpsring_chunk_index, current_chunk) } @@ -137,14 +137,13 @@ fn generate_offspring_genome( let parent_one_genome = parents.0; let mut last_parent_index = 0_usize; - let (mut index, mut chunk) = offspring_mutation_chunks.new_chunk_mut(); + let (mut index, mut chunk) = offspring_mutation_chunks.add_new_chunk(); for m in new_mutation_keys { // FIXME: here, we want to be skipping past chunks // whose last position is < mutations[m].position() let p = parent_one_genome.partition_point(|x| true); - (index, chunk) = - update_offspring_chunks(index, genomes, offspring_mutation_chunks, &mut chunk); + (index, chunk) = update_offspring_chunks(index, genomes, offspring_mutation_chunks, chunk); } } @@ -191,22 +190,6 @@ mod sinful_tests { use super::*; - #[test] - fn test_sizes() { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::>() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::>>() - ); - assert_eq!( - std::mem::size_of::>(), - std::mem::size_of::() - ); - } - #[test] fn test_non_zero_int_types() { let x = NonZeroU32::new(1).unwrap(); From b8d3f0c1338a1cd1633f4afe01fc6641049b3557 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 15:43:48 -0700 Subject: [PATCH 19/72] ugh, maybe struct approach is simpler at first --- src/genome_chunked_array.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index e7279fd..30b749a 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -97,6 +97,11 @@ impl Genomes { } } +// FIXME: this info is incomplete: +// * We don't know if we are writing into +// a partially-filled chunk. +// * We don't know if we are returning an empty +// or partially-filled chunk fn update_offspring_chunks( offpsring_chunk_index: usize, offspring_genomes: &mut Genomes, From 1eecc926a2a838a15af9eadcbcf24f045dbe7b90 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Tue, 2 May 2023 16:05:32 -0700 Subject: [PATCH 20/72] defeated by the borrow checker. More to come tomorrow... --- src/genome_chunked_array.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 30b749a..361b055 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -139,6 +139,7 @@ fn generate_offspring_genome( offspring_mutation_chunks: &mut MutationChunks, // Output chunks genomes: &mut Genomes, // Output genomes ) { + todo!("this may all be wrong. for efficiency, we may need the structure to be update-in-place and not fully DoD"); let parent_one_genome = parents.0; let mut last_parent_index = 0_usize; From 600c5327338acc3787914cedd3efd3b008c9a8ec Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 10:20:00 -0700 Subject: [PATCH 21/72] some notes --- src/genome_chunked_array.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 361b055..1094748 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -1,3 +1,16 @@ +//! This is a first draft of storing genomes +//! in "chunks" of a fixed length. A "chunk" +//! is a number of mutations. +//! +//! For this first pass, we will only support +//! efficient operations for non-overlapping +//! generations. Like genome_array.rs, each +//! new generation of genomes will be a new +//! container. Also like genome_array, this +//! design (probably) has efficiency issues +//! for overlapping generations. We will deal +//! with that later. + use crate::common::Mutation; // Mutations will be stored in blocks of 64 @@ -139,7 +152,6 @@ fn generate_offspring_genome( offspring_mutation_chunks: &mut MutationChunks, // Output chunks genomes: &mut Genomes, // Output genomes ) { - todo!("this may all be wrong. for efficiency, we may need the structure to be update-in-place and not fully DoD"); let parent_one_genome = parents.0; let mut last_parent_index = 0_usize; From d28bef2634d6b4fd5ad9876b554753602f161945 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 14:38:25 -0700 Subject: [PATCH 22/72] strip to bare bones based on pencil + paper sketches --- src/genome_chunked_array.rs | 221 ++---------------------------------- 1 file changed, 12 insertions(+), 209 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 1094748..685790d 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -50,219 +50,22 @@ mod pseudocode { } } -#[derive(Copy, Clone, Debug)] -struct Chunk { - start: usize, - stop: usize, -} - -impl Chunk { - fn new(start: usize, stop: usize) -> Self { - Self { start, stop } - } +struct Mutations { + mutations: Vec, + counts: Vec, } #[derive(Default)] -pub struct MutationChunks { - mutations: Vec, // indexes into mutation vector. - // u32::MAX is treated as a NULL "sentinel" - chunks: Vec, // Is this needed? +struct MutationChunks { + mutation_ids: Vec, // indexes into mutation vector. counts: Vec, - queue: Vec, -} - -impl MutationChunks { - const CHUNK_SIZE: usize = CHUNK_SIZE; // Maybe this should be here? - - fn chunk(&self, at: usize) -> Chunk { - Chunk::new(at * Self::CHUNK_SIZE, at * Self::CHUNK_SIZE + CHUNK_SIZE) - } - - // The None path may not be the most efficient - fn add_new_chunk(&mut self) -> (usize, Chunk) { - assert_eq!(self.mutations.len() / Self::CHUNK_SIZE, 0); - match self.queue.pop() { - Some(index) => (index, self.chunk(index)), - None => { - let id = self.mutations.len() / Self::CHUNK_SIZE; - self.mutations - .resize(self.mutations.len() + Self::CHUNK_SIZE, u32::MAX); - (id, self.chunk(id)) - } - } - } -} - -struct Genomes { - chunks: Vec, // each genome is a range of chunk indexes - starts: Vec, // For each genome, where is its first chunk? - stops: Vec, // One past last chunk such that a genome is chunks[starts[i]..stops[i]] -} - -impl Genomes { - fn is_empty(&self) -> bool { - assert_eq!(self.starts.len(), self.stops.len()); - self.starts.is_empty() - } - - fn genome(&self, index: usize) -> &[u32] { - &self.chunks[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE] - } -} - -// FIXME: this info is incomplete: -// * We don't know if we are writing into -// a partially-filled chunk. -// * We don't know if we are returning an empty -// or partially-filled chunk -fn update_offspring_chunks( - offpsring_chunk_index: usize, - offspring_genomes: &mut Genomes, - offspring_mutation_chunks: &mut MutationChunks, - current_chunk: Chunk, -) -> (usize, Chunk) { - (offpsring_chunk_index, current_chunk) -} - -// What we really need: -// 1. Do we need a new chunk? -// 2. If yes, push (or recycle) one. -// 3. If no, fill what we can of the current (last?) chunk. -// Actually, a lot of this is wrong: -// 1. We don't know what a "genome" is, yet, in memory. -// 2. So, are we adding to a genome, updating an existing -// genome, or what? -// So what we actually really need is: -// 1. To know the parental genome identifiers -// 2. To be able to get new chunks, which means that MutationChunks -// has to be part of the conversation. -// 3. Need partition searching to copy parental stuff over. -// 4. Etc.. -fn generate_offspring_genome( - // NOTE: we are already bowing - // to the borrow checker: - // we cannot have parental/offspring - // genomes in the same container. - // Do "chunks" and "genomes" need to - // be in the same struct? - parents: (&[u32], &[u32]), // parental genomes - mutations: &[Mutation], - parent_mutation_chunks: &MutationChunks, // Output chunks - new_mutation_keys: &[u32], // The indexes of the new mutations - offspring_mutation_chunks: &mut MutationChunks, // Output chunks - genomes: &mut Genomes, // Output genomes -) { - let parent_one_genome = parents.0; - - let mut last_parent_index = 0_usize; - let (mut index, mut chunk) = offspring_mutation_chunks.add_new_chunk(); - for m in new_mutation_keys { - // FIXME: here, we want to be skipping past chunks - // whose last position is < mutations[m].position() - let p = parent_one_genome.partition_point(|x| true); - - (index, chunk) = update_offspring_chunks(index, genomes, offspring_mutation_chunks, chunk); - } -} - -impl Genomes {} - -#[cfg(test)] -mod development_tests { - use super::*; - - #[test] - fn test_add_mutations() { - let mut genomes = Genomes { - chunks: vec![], - starts: vec![], - stops: vec![], - }; - - let mutations = vec![ - Mutation::new(0.try_into().unwrap(), vec![], 0.into()), - Mutation::new(1.try_into().unwrap(), vec![], 0.into()), - Mutation::new(1.try_into().unwrap(), vec![], 0.into()), - ]; - let pchunks = MutationChunks::default(); - let mut chunks = MutationChunks::default(); - let new_mutations = [0, 1, 2]; - generate_offspring_genome( - (&[], &[]), - &mutations, - &pchunks, - &new_mutations, - &mut chunks, - &mut genomes, - ); - assert_eq!(genomes.starts.len(), 1); - assert_eq!(genomes.stops.len(), 1); - assert_eq!(genomes.stops[0] - genomes.starts[0], CHUNK_SIZE); - assert_eq!(genomes.chunks.len(), CHUNK_SIZE); - } + // How many valid mutation ids are in a chunk. + // The rest must be the sentinel value u32::MAX + occupancy: Vec, } -#[cfg(test)] -mod sinful_tests { - use std::num::NonZeroU32; - - use super::*; - - #[test] - fn test_non_zero_int_types() { - let x = NonZeroU32::new(1).unwrap(); - let y = x.checked_add(1).unwrap(); - assert_eq!(y, 2.try_into().unwrap()); - } - - //#[test] - //fn test_arrayref() { - // fn mod_array(x: &mut [i32; 3]) { - // x[2] = 4; - // } - // use arrayref::array_mut_ref; - // let mut x = vec![1, 2, 3]; - // mod_array(array_mut_ref![x, 0, 3]); - // assert_eq!(x[2], 4); - //} - - #[test] - fn test_slice_try_into_array() { - let mut v = vec![1; 64]; - let vs = &mut v[0..32]; - let s: &mut [i32; 32] = vs.try_into().unwrap(); - s[10] = 2; - assert_eq!(v[10], 2); - let vs = &mut v[0..64]; - let s: &mut [i32; 64] = vs.try_into().unwrap(); - s[10] = 3; - assert_eq!(v[10], 3); - - let mut v = vec![1; 256]; - let vs = &mut v[9..9 + 64]; - let s: &mut [i32; 64] = vs.try_into().unwrap(); - s[1] = 3; - assert_eq!(v[10], 3); - } - - #[test] - fn test_copy_free_array_from_struct() { - struct X { - x: Vec, - } - - impl X { - fn chunk(&mut self) -> &mut [i32; 4] { - let s = &mut self.x[0..4]; - s.try_into().unwrap() - } - } - - let mut x = X { - x: vec![0, 1, 2, 3], - }; - let c = x.chunk(); - c[0] += 10; - assert_eq!(x.x[0], 10); - } +struct HaploidGenomes { + mutation_chunk_ids: Vec, // each genome is a CONTIGUOUS range of chunk indexes + starts: Vec, // For each genome, where is its first chunk? + stops: Vec, // One past last chunk such that a genome is mutation_chunk_ids[starts[i]..stops[i]] } From 7b6553e391d51e8833763ebfc3ba2caa5a630346 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 14:43:00 -0700 Subject: [PATCH 23/72] Default --- src/genome_chunked_array.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 685790d..138276b 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -50,6 +50,7 @@ mod pseudocode { } } +#[derive(Default)] struct Mutations { mutations: Vec, counts: Vec, @@ -64,6 +65,7 @@ struct MutationChunks { occupancy: Vec, } +#[derive(Default)] struct HaploidGenomes { mutation_chunk_ids: Vec, // each genome is a CONTIGUOUS range of chunk indexes starts: Vec, // For each genome, where is its first chunk? From 5acb47f7a98d11937c51b72bb6c21dcb6d5023ad Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:03:05 -0700 Subject: [PATCH 24/72] some basic tets --- src/genome_chunked_array.rs | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 138276b..1ac4658 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -63,6 +63,43 @@ struct MutationChunks { // How many valid mutation ids are in a chunk. // The rest must be the sentinel value u32::MAX occupancy: Vec, + // indexes where chunks have count of 0. + // We can use these to "reallocate" new chunks. + queue: Vec, +} + +impl MutationChunks { + fn new_chunk(&mut self) -> usize { + match self.queue.pop() { + Some(index) => { + assert_eq!(self.counts[index], 0); + self.mutation_ids[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE] + .fill(u32::MAX); + self.occupancy[index] = 0; + index + } + None => { + let index = self.mutation_ids.len(); + self.mutation_ids.resize(index + CHUNK_SIZE, u32::MAX); + self.occupancy.push(0); + self.counts.push(0); + index / CHUNK_SIZE + } + } + } + + fn occupancy(&self, at: usize) -> i32 { + self.occupancy[at] + } + + fn fill_queue(&mut self) { + self.queue = self + .counts + .iter() + .enumerate() + .filter_map(|(i, c)| if c == &0 { Some(i) } else { None }) + .collect(); + } } #[derive(Default)] @@ -71,3 +108,35 @@ struct HaploidGenomes { starts: Vec, // For each genome, where is its first chunk? stops: Vec, // One past last chunk such that a genome is mutation_chunk_ids[starts[i]..stops[i]] } + +#[cfg(test)] +mod test_mutation_chunks { + use super::MutationChunks; + + #[test] + fn test_add_chunk() { + let mut mc = MutationChunks::default(); + let nc = mc.new_chunk(); + assert_eq!(nc, 0); + assert_eq!(mc.occupancy(nc), 0); + } + + #[test] + fn test_recycle_chunk() { + let mut mc = MutationChunks::default(); + let nc = mc.new_chunk(); + assert_eq!(nc, 0); + assert_eq!(mc.occupancy(nc), 0); + mc.fill_queue(); + let nc = mc.new_chunk(); + assert_eq!(nc, 0); + mc.counts[0] += 1; + mc.fill_queue(); + let nc = mc.new_chunk(); + assert_eq!(nc, 1); + mc.counts[1] += 1; + mc.fill_queue(); + let nc = mc.new_chunk(); + assert_eq!(nc, 2); + } +} From 27caa974d5ccef03f72794312a22442487839682 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:07:36 -0700 Subject: [PATCH 25/72] add is_empty --- src/genome_chunked_array.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 1ac4658..47021d9 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -88,8 +88,12 @@ impl MutationChunks { } } - fn occupancy(&self, at: usize) -> i32 { - self.occupancy[at] + fn occupancy(&self, chunk: usize) -> i32 { + self.occupancy[chunk] + } + + fn is_empty(&self, chunk: usize) -> bool { + self.occupancy(chunk) == 0 } fn fill_queue(&mut self) { @@ -126,7 +130,7 @@ mod test_mutation_chunks { let mut mc = MutationChunks::default(); let nc = mc.new_chunk(); assert_eq!(nc, 0); - assert_eq!(mc.occupancy(nc), 0); + assert!(mc.is_empty(nc)); mc.fill_queue(); let nc = mc.new_chunk(); assert_eq!(nc, 0); From 26101d3a394b2fa14a2891de337e52422cf73608 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:24:11 -0700 Subject: [PATCH 26/72] copying one chunk into another --- src/genome_chunked_array.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 47021d9..3a8dca5 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -104,6 +104,13 @@ impl MutationChunks { .filter_map(|(i, c)| if c == &0 { Some(i) } else { None }) .collect(); } + + fn fill_from(&mut self, source: usize, destination: usize) { + for i in 0..self.occupancy(source) { + self.mutation_ids[destination * CHUNK_SIZE + i as usize] = + self.mutation_ids[source * CHUNK_SIZE + i as usize]; + } + } } #[derive(Default)] @@ -116,6 +123,7 @@ struct HaploidGenomes { #[cfg(test)] mod test_mutation_chunks { use super::MutationChunks; + use super::CHUNK_SIZE; #[test] fn test_add_chunk() { @@ -143,4 +151,24 @@ mod test_mutation_chunks { let nc = mc.new_chunk(); assert_eq!(nc, 2); } + + #[test] + fn test_fill_entire_chunk_from_another_full_chunk() { + let mut mc = MutationChunks::default(); + let first = mc.new_chunk(); + let second = mc.new_chunk(); + + // Make up some data + for i in 0..CHUNK_SIZE { + mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + } + mc.occupancy[first] = CHUNK_SIZE as i32; + + mc.fill_from(first, second); + let s = &mc.mutation_ids[second * CHUNK_SIZE..]; + for i in 0..CHUNK_SIZE { + assert_eq!(s[i], i as u32); + } + assert_eq!(mc.occupancy(second), CHUNK_SIZE as i32); + } } From 0912ebefe3d2bd1fbe5425dbaa137e100acaea47 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:25:39 -0700 Subject: [PATCH 27/72] tests passsing --- src/genome_chunked_array.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 3a8dca5..57dbf06 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -106,10 +106,12 @@ impl MutationChunks { } fn fill_from(&mut self, source: usize, destination: usize) { + assert_ne!(source, destination); for i in 0..self.occupancy(source) { self.mutation_ids[destination * CHUNK_SIZE + i as usize] = self.mutation_ids[source * CHUNK_SIZE + i as usize]; } + self.occupancy[destination] = self.occupancy[source]; } } From c5a44eec3902e830903c0031bda3392683002451 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:29:15 -0700 Subject: [PATCH 28/72] add to occupancy --- src/genome_chunked_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 57dbf06..7efcbf3 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -111,7 +111,7 @@ impl MutationChunks { self.mutation_ids[destination * CHUNK_SIZE + i as usize] = self.mutation_ids[source * CHUNK_SIZE + i as usize]; } - self.occupancy[destination] = self.occupancy[source]; + self.occupancy[destination] += self.occupancy[source]; } } From d3c6514e84d0b2f37014e7be398054165f4851ca Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 15:44:34 -0700 Subject: [PATCH 29/72] comment --- src/genome_chunked_array.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 7efcbf3..d1560d3 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -107,6 +107,11 @@ impl MutationChunks { fn fill_from(&mut self, source: usize, destination: usize) { assert_ne!(source, destination); + // NOTE: code like this could be a perf bottleneck. + // We may need to get each mutable sub-slice out + // using .split_at_mut() and some indexing. + // With the slices, iterators may be helpful + // in dodging boundary checks. for i in 0..self.occupancy(source) { self.mutation_ids[destination * CHUNK_SIZE + i as usize] = self.mutation_ids[source * CHUNK_SIZE + i as usize]; From 57c17483ce4aee4dfda14d0e524748ba4c318907 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Wed, 3 May 2023 18:31:18 -0700 Subject: [PATCH 30/72] note for later --- src/genome_chunked_array.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index d1560d3..f88a415 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -105,6 +105,9 @@ impl MutationChunks { .collect(); } + + // Not clear that we really want this kind of logic. + // This fn may disappear later. fn fill_from(&mut self, source: usize, destination: usize) { assert_ne!(source, destination); // NOTE: code like this could be a perf bottleneck. From 63250516a5edd2990ad71ace29a4f1d1daab1601 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 09:26:30 -0700 Subject: [PATCH 31/72] Note --- src/genome_chunked_array.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index f88a415..be6f0ac 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -62,6 +62,10 @@ struct MutationChunks { counts: Vec, // How many valid mutation ids are in a chunk. // The rest must be the sentinel value u32::MAX + // NOTE: defining this as "how many" is akin to + // the len() of a Vec. That definition may + // make doing things like getting + // the last position out of a chunk awkward. occupancy: Vec, // indexes where chunks have count of 0. // We can use these to "reallocate" new chunks. From 2dd89c2752950b3dd8c99ffbd76dd07b75ac723d Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 09:45:58 -0700 Subject: [PATCH 32/72] basic position fetching --- src/genome_chunked_array.rs | 52 ++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index be6f0ac..63cd2c9 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -109,7 +109,6 @@ impl MutationChunks { .collect(); } - // Not clear that we really want this kind of logic. // This fn may disappear later. fn fill_from(&mut self, source: usize, destination: usize) { @@ -125,6 +124,29 @@ impl MutationChunks { } self.occupancy[destination] += self.occupancy[source]; } + + fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { + let o = self.occupancy(chunk); + match o { + x if x == 0 => None, + x if x > 0 && ((x as usize) <= CHUNK_SIZE) => { + Some(mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position()) + } + _ => panic!("invalid occupancy value"), + } + } + + fn last_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { + let o = self.occupancy(chunk); + match o { + x if x == 0 => None, + x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some( + mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1) as usize] as usize] + .position(), + ), + _ => panic!("invalid occupancy value"), + } + } } #[derive(Default)] @@ -136,6 +158,7 @@ struct HaploidGenomes { #[cfg(test)] mod test_mutation_chunks { + use super::Mutation; use super::MutationChunks; use super::CHUNK_SIZE; @@ -153,6 +176,8 @@ mod test_mutation_chunks { let nc = mc.new_chunk(); assert_eq!(nc, 0); assert!(mc.is_empty(nc)); + assert!(mc.last_position(nc, &[]).is_none()); + assert!(mc.first_position(nc, &[]).is_none()); mc.fill_queue(); let nc = mc.new_chunk(); assert_eq!(nc, 0); @@ -166,6 +191,31 @@ mod test_mutation_chunks { assert_eq!(nc, 2); } + #[test] + fn test_first_last_pos_full_chunk() { + let mut mc = MutationChunks::default(); + let first = mc.new_chunk(); + + // Make up some data + let mut mutations = vec![]; + for i in 0..CHUNK_SIZE { + mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + let pos = i64::try_from(i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + } + mc.occupancy[first] = CHUNK_SIZE as i32; + assert!(mc.first_position(first, &mutations).is_some()); + assert!(mc.last_position(first, &mutations).is_some()); + assert_eq!( + mc.first_position(first, &mutations), + Some(forrustts::Position::try_from(0_i64).unwrap()) + ); + assert_eq!( + mc.last_position(first, &mutations), + Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) + ); + } + #[test] fn test_fill_entire_chunk_from_another_full_chunk() { let mut mc = MutationChunks::default(); From e9aafbe20c30adf6c46feefb2f33a06d8dadee90 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 10:00:14 -0700 Subject: [PATCH 33/72] dedup --- src/genome_chunked_array.rs | 57 ++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 63cd2c9..c68db78 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -125,27 +125,58 @@ impl MutationChunks { self.occupancy[destination] += self.occupancy[source]; } - fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { + fn position_details( + &self, + chunk: usize, + mutations: &[Mutation], + f: F, + ) -> Option + where + F: Fn(usize, usize, &[Mutation]) -> forrustts::Position, + { let o = self.occupancy(chunk); match o { x if x == 0 => None, - x if x > 0 && ((x as usize) <= CHUNK_SIZE) => { - Some(mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position()) - } + x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some(f(chunk, x as usize, mutations)), _ => panic!("invalid occupancy value"), } } + // In general, we don't want to mess w/empty chunks, + // other than when we FIRST create one. + // so we may revisit some of this logic later + // FIXME: code duplication w/last_position + fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { + self.position_details(chunk, mutations, |c, _, m| { + m[self.mutation_ids[c * CHUNK_SIZE] as usize].position() + }) + //let o = self.occupancy(chunk); + //match o { + // x if x == 0 => None, + // x if x > 0 && ((x as usize) <= CHUNK_SIZE) => { + // Some(mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position()) + // } + // _ => panic!("invalid occupancy value"), + //} + } + + // In general, we don't want to mess w/empty chunks, + // other than when we FIRST create one. + // so we may revisit some of this logic later + // FIXME: code duplication w/first_position fn last_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { - let o = self.occupancy(chunk); - match o { - x if x == 0 => None, - x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some( - mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1) as usize] as usize] - .position(), - ), - _ => panic!("invalid occupancy value"), - } + self.position_details(chunk, mutations, |c, x, m| { + m[self.mutation_ids[c * CHUNK_SIZE + (x - 1)] as usize].position() + }) + //let o = self.occupancy(chunk); + //match o { + // x if x == 0 => None, + // x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some( + // mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1) as usize] as usize] + // .position(), + // ), + // _ => panic!("invalid occupancy value"), + //} } } From 1e59416b81cfffcdd7863263f394b6d776f129f8 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 10:01:44 -0700 Subject: [PATCH 34/72] simplify --- src/genome_chunked_array.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index c68db78..aeabb2b 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -125,19 +125,14 @@ impl MutationChunks { self.occupancy[destination] += self.occupancy[source]; } - fn position_details( - &self, - chunk: usize, - mutations: &[Mutation], - f: F, - ) -> Option + fn position_details(&self, chunk: usize, f: F) -> Option where - F: Fn(usize, usize, &[Mutation]) -> forrustts::Position, + F: Fn(usize) -> forrustts::Position, { let o = self.occupancy(chunk); match o { x if x == 0 => None, - x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some(f(chunk, x as usize, mutations)), + x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some(f(x as usize)), _ => panic!("invalid occupancy value"), } } @@ -147,8 +142,8 @@ impl MutationChunks { // so we may revisit some of this logic later // FIXME: code duplication w/last_position fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { - self.position_details(chunk, mutations, |c, _, m| { - m[self.mutation_ids[c * CHUNK_SIZE] as usize].position() + self.position_details(chunk, |_| { + mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position() }) //let o = self.occupancy(chunk); //match o { @@ -165,8 +160,8 @@ impl MutationChunks { // so we may revisit some of this logic later // FIXME: code duplication w/first_position fn last_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { - self.position_details(chunk, mutations, |c, x, m| { - m[self.mutation_ids[c * CHUNK_SIZE + (x - 1)] as usize].position() + self.position_details(chunk, |x| { + mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1)] as usize].position() }) //let o = self.occupancy(chunk); //match o { From 823cb54b2386bdb650fd9173dff52f405e38e407 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 10:02:41 -0700 Subject: [PATCH 35/72] cleanup --- src/genome_chunked_array.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index aeabb2b..d1f8926 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -145,14 +145,6 @@ impl MutationChunks { self.position_details(chunk, |_| { mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position() }) - //let o = self.occupancy(chunk); - //match o { - // x if x == 0 => None, - // x if x > 0 && ((x as usize) <= CHUNK_SIZE) => { - // Some(mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position()) - // } - // _ => panic!("invalid occupancy value"), - //} } // In general, we don't want to mess w/empty chunks, @@ -163,15 +155,6 @@ impl MutationChunks { self.position_details(chunk, |x| { mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1)] as usize].position() }) - //let o = self.occupancy(chunk); - //match o { - // x if x == 0 => None, - // x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some( - // mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1) as usize] as usize] - // .position(), - // ), - // _ => panic!("invalid occupancy value"), - //} } } From b82227d618c68903a1cbfeca43a3860bb9ca39de Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 10:46:04 -0700 Subject: [PATCH 36/72] FIXME fixed --- src/genome_chunked_array.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index d1f8926..c353c1e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -140,7 +140,6 @@ impl MutationChunks { // In general, we don't want to mess w/empty chunks, // other than when we FIRST create one. // so we may revisit some of this logic later - // FIXME: code duplication w/last_position fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { self.position_details(chunk, |_| { mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position() @@ -150,7 +149,6 @@ impl MutationChunks { // In general, we don't want to mess w/empty chunks, // other than when we FIRST create one. // so we may revisit some of this logic later - // FIXME: code duplication w/first_position fn last_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { self.position_details(chunk, |x| { mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1)] as usize].position() From aad26dbf26bd306e6969ef7bfeb8a529775a0178 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 11:16:52 -0700 Subject: [PATCH 37/72] on a path to ruin here... --- src/genome_chunked_array.rs | 92 +++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index c353c1e..80e1cc9 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -243,3 +243,95 @@ mod test_mutation_chunks { assert_eq!(mc.occupancy(second), CHUNK_SIZE as i32); } } + +#[cfg(test)] +mod test_haploid_genomes { + use super::*; + + // I STRONGLY DISLIKE the semantics here. + // This will lead to all sorts of edge cases + #[test] + fn test_search_thru_empty_chunk() { + let mut mc = MutationChunks::default(); + let first = mc.new_chunk(); + let second = mc.new_chunk(); + let third = mc.new_chunk(); + let fourth = mc.new_chunk(); + let mut mutations = vec![]; + for i in 0..CHUNK_SIZE { + mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + let pos = i64::try_from(i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mc.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); + } + mc.occupancy[first] = CHUNK_SIZE as i32; + mc.occupancy[third] = CHUNK_SIZE as i32; + + assert!(mc.first_position(second, &mutations).is_none()); + assert!(mc.last_position(second, &mutations).is_none()); + assert!(mc.first_position(fourth, &mutations).is_none()); + assert!(mc.last_position(fourth, &mutations).is_none()); + assert_eq!( + mc.first_position(first, &mutations), + Some(forrustts::Position::try_from(0).unwrap()) + ); + assert_eq!( + mc.last_position(first, &mutations), + Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) + ); + assert_eq!( + mc.first_position(third, &mutations), + Some(forrustts::Position::try_from(CHUNK_SIZE as i64).unwrap()) + ); + assert_eq!( + mc.last_position(third, &mutations), + Some(forrustts::Position::try_from((2 * CHUNK_SIZE as i64) - 1).unwrap()) + ); + + // Okay, so now we can set up some haploid genomes + let mut haploid_genomes = HaploidGenomes::default(); + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(second as u32); + haploid_genomes.mutation_chunk_ids.push(third as u32); + haploid_genomes.mutation_chunk_ids.push(fourth as u32); + haploid_genomes.starts.push(0); + haploid_genomes.stops.push(4); + + // Okay, now we have a genome with chunks that are full, empty, full. + // We envision this happening when, at the end of a generation, + // all mutations in chunk 1 (2nd chunk) become fixed. The idea is that + // we will mark all elements corresponding to fixed mutations with + // our sentinel value. + + // The goal of this test is to be able to correctly use partition_point + // for this kind of case. + let test_position = forrustts::Position::try_from(100).unwrap(); + assert!(test_position >= mc.first_position(third, &mutations).unwrap()); + assert!(test_position < mc.last_position(third, &mutations).unwrap()); + + let genome = &haploid_genomes.mutation_chunk_ids + [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; + println!("{genome:?}"); + let p = genome.partition_point(|&c| { + let comp = mc.last_position(c as usize, &mutations).is_none() + || mc.last_position(c as usize, &mutations).unwrap() < test_position; + println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); + comp + }); + assert!(p > 0); + assert_eq!(p, 2); + + let test_position = forrustts::Position::try_from(200).unwrap(); + let p = genome.partition_point(|&c| { + let comp = mc.last_position(c as usize, &mutations).is_none() + || mc.last_position(c as usize, &mutations).unwrap() < test_position; + println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); + comp + }); + assert_eq!(p, 4); + } +} From b08b635540888bf07f30729e498a4edcbcd67780 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 11:42:49 -0700 Subject: [PATCH 38/72] getting somewhere --- src/genome_chunked_array.rs | 117 ++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 80e1cc9..d4b734e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -156,6 +156,32 @@ impl MutationChunks { } } +// This is a quick implementation of C++'s remove_if, which is a method +// of partitioning a slice such that retained items are STABLY moved to the +// front. +// NOTE: we can probably do this with generics later on +// NOTE: should the be associated w/HaploidGenomes? +fn remove_chunk_if_empty( + mutation_chunk_ids: &mut [u32], + mutation_chunks: &MutationChunks, +) -> usize { + let mut first = match mutation_chunk_ids + .iter() + .position(|&c| mutation_chunks.is_empty(c as usize)) + { + Some(index) => index, + None => mutation_chunk_ids.len(), + }; + let f = first; + for i in f..mutation_chunk_ids.len() { + if !mutation_chunks.is_empty(i) { + mutation_chunk_ids[first] = mutation_chunk_ids[i]; + first += 1; + } + } + first +} + #[derive(Default)] struct HaploidGenomes { mutation_chunk_ids: Vec, // each genome is a CONTIGUOUS range of chunk indexes @@ -163,6 +189,20 @@ struct HaploidGenomes { stops: Vec, // One past last chunk such that a genome is mutation_chunk_ids[starts[i]..stops[i]] } +impl HaploidGenomes { + fn move_empty_chunks(&mut self, mutation_chunks: &MutationChunks) { + assert_eq!(self.starts.len(), self.stops.len()); + let ngenomes = self.starts.len(); + for genome in 0..ngenomes { + let chunks = &mut self.mutation_chunk_ids[self.starts[genome]..self.stops[genome]]; + println!("{chunks:?}"); + let nremoved = remove_chunk_if_empty(chunks, mutation_chunks); + println!("{chunks:?}"); + self.stops[0] = nremoved; + } + } +} + #[cfg(test)] mod test_mutation_chunks { use super::Mutation; @@ -334,4 +374,81 @@ mod test_haploid_genomes { }); assert_eq!(p, 4); } + + // This leads to saner/safer semantics than the above + #[test] + fn purge_empty_chunks() { + let mut mc = MutationChunks::default(); + let first = mc.new_chunk(); + let second = mc.new_chunk(); + let third = mc.new_chunk(); + let fourth = mc.new_chunk(); + let mut mutations = vec![]; + for i in 0..CHUNK_SIZE { + mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + let pos = i64::try_from(i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mc.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); + } + mc.occupancy[first] = CHUNK_SIZE as i32; + mc.occupancy[third] = CHUNK_SIZE as i32; + + assert!(mc.first_position(second, &mutations).is_none()); + assert!(mc.last_position(second, &mutations).is_none()); + assert!(mc.first_position(fourth, &mutations).is_none()); + assert!(mc.last_position(fourth, &mutations).is_none()); + assert_eq!( + mc.first_position(first, &mutations), + Some(forrustts::Position::try_from(0).unwrap()) + ); + assert_eq!( + mc.last_position(first, &mutations), + Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) + ); + assert_eq!( + mc.first_position(third, &mutations), + Some(forrustts::Position::try_from(CHUNK_SIZE as i64).unwrap()) + ); + assert_eq!( + mc.last_position(third, &mutations), + Some(forrustts::Position::try_from((2 * CHUNK_SIZE as i64) - 1).unwrap()) + ); + + // Okay, so now we can set up some haploid genomes + let mut haploid_genomes = HaploidGenomes::default(); + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(second as u32); + haploid_genomes.mutation_chunk_ids.push(third as u32); + haploid_genomes.mutation_chunk_ids.push(fourth as u32); + haploid_genomes.starts.push(0); + haploid_genomes.stops.push(4); + + haploid_genomes.move_empty_chunks(&mc); + assert_eq!(haploid_genomes.stops[0], 2); + + let genome = &haploid_genomes.mutation_chunk_ids + [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; + assert_eq!(genome.len(), 2); + let test_position = forrustts::Position::try_from(100).unwrap(); + let p = genome.partition_point(|&c| { + let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; + println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); + comp + }); + assert!(p > 0); + assert!(p == 1); + + let test_position = forrustts::Position::try_from(200).unwrap(); + let p = genome.partition_point(|&c| { + let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; + println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); + comp + }); + assert!(p > 0); + assert!(p == 2); + } } From 95e502a4221e0fe384e55140c145a38e877e1d62 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 12:21:10 -0700 Subject: [PATCH 39/72] test more complex cases --- src/genome_chunked_array.rs | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index d4b734e..76dfe70 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -174,8 +174,11 @@ fn remove_chunk_if_empty( }; let f = first; for i in f..mutation_chunk_ids.len() { - if !mutation_chunks.is_empty(i) { - mutation_chunk_ids[first] = mutation_chunk_ids[i]; + if !mutation_chunks.is_empty(mutation_chunk_ids[i] as usize) { + mutation_chunk_ids.swap(first, i); + //let temp = mutation_chunk_ids[first]; + //mutation_chunk_ids[first] = mutation_chunk_ids[i]; + //mutation_chunk_ids[i] = temp; first += 1; } } @@ -383,6 +386,7 @@ mod test_haploid_genomes { let second = mc.new_chunk(); let third = mc.new_chunk(); let fourth = mc.new_chunk(); + let fifth = mc.new_chunk(); let mut mutations = vec![]; for i in 0..CHUNK_SIZE { mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); @@ -394,8 +398,14 @@ mod test_haploid_genomes { mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); mc.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(2 * CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mc.mutation_ids[fifth * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); + } mc.occupancy[first] = CHUNK_SIZE as i32; mc.occupancy[third] = CHUNK_SIZE as i32; + mc.occupancy[fifth] = CHUNK_SIZE as i32; assert!(mc.first_position(second, &mutations).is_none()); assert!(mc.last_position(second, &mutations).is_none()); @@ -417,6 +427,14 @@ mod test_haploid_genomes { mc.last_position(third, &mutations), Some(forrustts::Position::try_from((2 * CHUNK_SIZE as i64) - 1).unwrap()) ); + assert_eq!( + mc.first_position(fifth, &mutations), + Some(forrustts::Position::try_from(2 * CHUNK_SIZE as i64).unwrap()) + ); + assert_eq!( + mc.last_position(fifth, &mutations), + Some(forrustts::Position::try_from((3 * CHUNK_SIZE as i64) - 1).unwrap()) + ); // Okay, so now we can set up some haploid genomes let mut haploid_genomes = HaploidGenomes::default(); @@ -424,15 +442,17 @@ mod test_haploid_genomes { haploid_genomes.mutation_chunk_ids.push(second as u32); haploid_genomes.mutation_chunk_ids.push(third as u32); haploid_genomes.mutation_chunk_ids.push(fourth as u32); + haploid_genomes.mutation_chunk_ids.push(fifth as u32); haploid_genomes.starts.push(0); - haploid_genomes.stops.push(4); + haploid_genomes.stops.push(5); haploid_genomes.move_empty_chunks(&mc); - assert_eq!(haploid_genomes.stops[0], 2); + assert_eq!(haploid_genomes.stops[0], 3); let genome = &haploid_genomes.mutation_chunk_ids [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; - assert_eq!(genome.len(), 2); + assert_eq!(genome, &[0, 2, 4]); + assert_eq!(genome.len(), 3); let test_position = forrustts::Position::try_from(100).unwrap(); let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; @@ -449,6 +469,6 @@ mod test_haploid_genomes { comp }); assert!(p > 0); - assert!(p == 2); + assert!(p == 3); } } From 9a8db0d040aa3fc16d3005d002c6826dff686256 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 4 May 2023 12:30:01 -0700 Subject: [PATCH 40/72] more detailed tests --- src/genome_chunked_array.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 76dfe70..30b3acb 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -451,8 +451,11 @@ mod test_haploid_genomes { let genome = &haploid_genomes.mutation_chunk_ids [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; - assert_eq!(genome, &[0, 2, 4]); + assert_eq!(genome, &[first as u32, third as u32, fifth as u32]); assert_eq!(genome.len(), 3); + assert!(genome + .windows(2) + .all(|w| mutations[w[0] as usize].position() <= mutations[w[1] as usize].position())); let test_position = forrustts::Position::try_from(100).unwrap(); let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; From de1d446a53c3ccbbe5c7f5093182a21015c9eda3 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 8 May 2023 10:33:10 -0700 Subject: [PATCH 41/72] failing test of single crossover --- src/genome_chunked_array.rs | 124 ++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 7 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 30b3acb..212b364 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -198,9 +198,7 @@ impl HaploidGenomes { let ngenomes = self.starts.len(); for genome in 0..ngenomes { let chunks = &mut self.mutation_chunk_ids[self.starts[genome]..self.stops[genome]]; - println!("{chunks:?}"); let nremoved = remove_chunk_if_empty(chunks, mutation_chunks); - println!("{chunks:?}"); self.stops[0] = nremoved; } } @@ -358,11 +356,9 @@ mod test_haploid_genomes { let genome = &haploid_genomes.mutation_chunk_ids [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; - println!("{genome:?}"); let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).is_none() || mc.last_position(c as usize, &mutations).unwrap() < test_position; - println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); comp }); assert!(p > 0); @@ -372,7 +368,6 @@ mod test_haploid_genomes { let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).is_none() || mc.last_position(c as usize, &mutations).unwrap() < test_position; - println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); comp }); assert_eq!(p, 4); @@ -459,7 +454,6 @@ mod test_haploid_genomes { let test_position = forrustts::Position::try_from(100).unwrap(); let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; - println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); comp }); assert!(p > 0); @@ -468,10 +462,126 @@ mod test_haploid_genomes { let test_position = forrustts::Position::try_from(200).unwrap(); let p = genome.partition_point(|&c| { let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; - println!("{c} {:?}, {comp}", mc.last_position(c as usize, &mutations)); comp }); assert!(p > 0); assert!(p == 3); } } + +#[cfg(test)] +mod tdd_crossover_semantics { + use forrustts::Position; + + use super::*; + + fn single_crossover( + genomes: (usize, usize), + breakpoint: Position, + mutations: &[Mutation], + mutation_chunks: &MutationChunks, + haploid_genomes: &HaploidGenomes, + output: &mut Vec, + ) { + let genome0 = &haploid_genomes.mutation_chunk_ids + [haploid_genomes.starts[genomes.0]..haploid_genomes.stops[genomes.0]]; + let genome1 = &haploid_genomes.mutation_chunk_ids + [haploid_genomes.starts[genomes.1]..haploid_genomes.stops[genomes.1]]; + + let p = genome0.partition_point(|&chunk| { + let comp = mutation_chunks + .last_position(chunk as usize, &mutations) + .unwrap() + < breakpoint; + comp + }); + output.extend_from_slice(&genome0[0..p]); + let p = genome1.partition_point(|&chunk| { + let comp = mutation_chunks + .last_position(chunk as usize, &mutations) + .unwrap() + < breakpoint; + comp + }); + output.extend_from_slice(&genome1[p..]); + } + + #[test] + fn test_simple_merge() { + let mut mutation_chunks = MutationChunks::default(); + let first = mutation_chunks.new_chunk(); + let second = mutation_chunks.new_chunk(); + let third = mutation_chunks.new_chunk(); + let mut mutations = vec![]; + for i in 0..CHUNK_SIZE { + mutation_chunks.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + let pos = i64::try_from(i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mutation_chunks.mutation_ids[second * CHUNK_SIZE + i] = + (mutations.len() - 1).try_into().unwrap(); + } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(2 * CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mutation_chunks.mutation_ids[third * CHUNK_SIZE + i] = + (mutations.len() - 1).try_into().unwrap(); + } + mutation_chunks.occupancy.fill(CHUNK_SIZE as i32); + assert_eq!(mutations.len(), 3 * CHUNK_SIZE); + let mut haploid_genomes = HaploidGenomes::default(); + + // make first genome + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(second as u32); + haploid_genomes.starts.push(0); + haploid_genomes.stops.push(2); + // make second genome + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(third as u32); + haploid_genomes.starts.push(2); + haploid_genomes.stops.push(4); + + let breakpoint = Position::try_from(CHUNK_SIZE as i64).unwrap(); + let mut output = vec![]; + single_crossover( + (0, 1), + breakpoint, + &mutations, + &mutation_chunks, + &haploid_genomes, + &mut output, + ); + println!("{output:?}"); + assert_eq!(output, &[0, 2]); + + let breakpoint = Position::try_from(2 * CHUNK_SIZE as i64).unwrap(); + let mut output = vec![]; + single_crossover( + (0, 1), + breakpoint, + &mutations, + &mutation_chunks, + &haploid_genomes, + &mut output, + ); + println!("{output:?}"); + assert_eq!(output, &[0, 1]); + + let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); + let mut output = vec![]; + single_crossover( + (0, 1), + breakpoint, + &mutations, + &mutation_chunks, + &haploid_genomes, + &mut output, + ); + println!("{output:?}"); + assert_eq!(output, &[0, 3]); + } +} From 6ed8354d50c4fe3124456c4eed2372c04a04de39 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 8 May 2023 10:37:39 -0700 Subject: [PATCH 42/72] notes on what we need. --- src/genome_chunked_array.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 212b364..78985d0 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -488,6 +488,13 @@ mod tdd_crossover_semantics { let genome1 = &haploid_genomes.mutation_chunk_ids [haploid_genomes.starts[genomes.1]..haploid_genomes.stops[genomes.1]]; + // What we are doing wrong here: + // 1. We are blindly copying over from + // one genome to another. + // 2. What we actaully SHOULD do + // is find if breakpoint exists + // WITHIN the chunks, and merge chunks if so + // 3. Then, update the index pointers by 1 let p = genome0.partition_point(|&chunk| { let comp = mutation_chunks .last_position(chunk as usize, &mutations) From e13e854fbe3cced440c0fd8d87340ad64dbf0c70 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 8 May 2023 10:41:29 -0700 Subject: [PATCH 43/72] need mutability here, I think? --- src/genome_chunked_array.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 78985d0..36c7a8c 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -479,8 +479,8 @@ mod tdd_crossover_semantics { genomes: (usize, usize), breakpoint: Position, mutations: &[Mutation], - mutation_chunks: &MutationChunks, haploid_genomes: &HaploidGenomes, + mutation_chunks: &mut MutationChunks, output: &mut Vec, ) { let genome0 = &haploid_genomes.mutation_chunk_ids @@ -558,8 +558,8 @@ mod tdd_crossover_semantics { (0, 1), breakpoint, &mutations, - &mutation_chunks, &haploid_genomes, + &mut mutation_chunks, &mut output, ); println!("{output:?}"); @@ -571,8 +571,8 @@ mod tdd_crossover_semantics { (0, 1), breakpoint, &mutations, - &mutation_chunks, &haploid_genomes, + &mut mutation_chunks, &mut output, ); println!("{output:?}"); @@ -584,8 +584,8 @@ mod tdd_crossover_semantics { (0, 1), breakpoint, &mutations, - &mutation_chunks, &haploid_genomes, + &mut mutation_chunks, &mut output, ); println!("{output:?}"); From 00fbc28120773b37fd215e3feba8c218909037df Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 8 May 2023 14:19:48 -0700 Subject: [PATCH 44/72] start to work on checking if we need new chunk --- src/genome_chunked_array.rs | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 36c7a8c..8aa892a 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -495,22 +495,43 @@ mod tdd_crossover_semantics { // is find if breakpoint exists // WITHIN the chunks, and merge chunks if so // 3. Then, update the index pointers by 1 - let p = genome0.partition_point(|&chunk| { + let p0 = genome0.partition_point(|&chunk| { let comp = mutation_chunks .last_position(chunk as usize, &mutations) .unwrap() < breakpoint; comp }); - output.extend_from_slice(&genome0[0..p]); - let p = genome1.partition_point(|&chunk| { + + let final_pos = if p0 < genome0.len() + && breakpoint + < mutation_chunks + .last_position(genome0[p0] as usize, mutations) + .unwrap() + { + let chunk_id = genome0[p0] as usize; + match mutation_chunks.mutation_ids[chunk_id * CHUNK_SIZE..(chunk_id + 1) * CHUNK_SIZE] + .iter() + .position(|&m| mutations[m as usize].position() < breakpoint) + { + Some(index) => index, + None => CHUNK_SIZE, + } + } else { + CHUNK_SIZE + }; + + todo!("if final_pos < CHUNK_SIZE, then we need a new chunk to work with."); + + output.extend_from_slice(&genome0[0..p0]); + let p1 = genome1.partition_point(|&chunk| { let comp = mutation_chunks .last_position(chunk as usize, &mutations) .unwrap() < breakpoint; comp }); - output.extend_from_slice(&genome1[p..]); + output.extend_from_slice(&genome1[p1..]); } #[test] From ffeb38240af1607bc2776788b3844ed2862f908a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 12 May 2023 16:03:36 -0700 Subject: [PATCH 45/72] another todo --- src/genome_chunked_array.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 8aa892a..221254a 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -503,6 +503,10 @@ mod tdd_crossover_semantics { comp }); + println!("{genome0:?}, {p0}, {breakpoint:?}"); + + todo!("if the insertion point is >= end of the genome, we are done?"); + let final_pos = if p0 < genome0.len() && breakpoint < mutation_chunks From df54097a4753bb48160568366a1014ff0df62969 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 11:45:12 -0700 Subject: [PATCH 46/72] do one todo --- src/genome_chunked_array.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 221254a..58f0ad2 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -505,7 +505,10 @@ mod tdd_crossover_semantics { println!("{genome0:?}, {p0}, {breakpoint:?}"); - todo!("if the insertion point is >= end of the genome, we are done?"); + if p0 == genome0.len() { + output.extend_from_slice(&genome0[0..p0]); + return; + } let final_pos = if p0 < genome0.len() && breakpoint From 95b91454b99620f044c97f6a0c09770f5f38871e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 12:47:56 -0700 Subject: [PATCH 47/72] fiddle with the error checking a bit --- src/genome_chunked_array.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 58f0ad2..7a27c04 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -516,6 +516,7 @@ mod tdd_crossover_semantics { .last_position(genome0[p0] as usize, mutations) .unwrap() { + println!("chunk = {}", genome0[p0]); let chunk_id = genome0[p0] as usize; match mutation_chunks.mutation_ids[chunk_id * CHUNK_SIZE..(chunk_id + 1) * CHUNK_SIZE] .iter() @@ -528,8 +529,11 @@ mod tdd_crossover_semantics { CHUNK_SIZE }; - todo!("if final_pos < CHUNK_SIZE, then we need a new chunk to work with."); - + if final_pos < CHUNK_SIZE { + todo!( + "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" + ); + } output.extend_from_slice(&genome0[0..p0]); let p1 = genome1.partition_point(|&chunk| { let comp = mutation_chunks @@ -606,6 +610,7 @@ mod tdd_crossover_semantics { println!("{output:?}"); assert_eq!(output, &[0, 1]); + println!("last..."); let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; single_crossover( From 53ee0f8fcab420818532db911c896d12c6acc01e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 12:50:24 -0700 Subject: [PATCH 48/72] and we are gettign the wrong answers from the previous loop... --- src/genome_chunked_array.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 7a27c04..29f7b80 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -530,6 +530,10 @@ mod tdd_crossover_semantics { }; if final_pos < CHUNK_SIZE { + for i in &mutation_chunks.mutation_ids[p0 * CHUNK_SIZE..(p0 + 1) * CHUNK_SIZE] { + println!("{i}"); + } + todo!( "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" ); From 8d47cab98fe74ee8488e2a04de641f54eeb89784 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 12:51:34 -0700 Subject: [PATCH 49/72] and we are gettign the wrong answers from the previous loop... --- src/genome_chunked_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 29f7b80..7652b0e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -530,7 +530,7 @@ mod tdd_crossover_semantics { }; if final_pos < CHUNK_SIZE { - for i in &mutation_chunks.mutation_ids[p0 * CHUNK_SIZE..(p0 + 1) * CHUNK_SIZE] { + for i in &mutation_chunks.mutation_ids[(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize)+ 1) * CHUNK_SIZE] { println!("{i}"); } From 0cd5dffb937e4d37cd625785dc7a3f69d147f4c3 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 14:32:21 -0700 Subject: [PATCH 50/72] split tests --- src/genome_chunked_array.rs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 7652b0e..a57f9c2 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -530,7 +530,9 @@ mod tdd_crossover_semantics { }; if final_pos < CHUNK_SIZE { - for i in &mutation_chunks.mutation_ids[(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize)+ 1) * CHUNK_SIZE] { + for i in &mutation_chunks.mutation_ids + [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] + { println!("{i}"); } @@ -549,8 +551,7 @@ mod tdd_crossover_semantics { output.extend_from_slice(&genome1[p1..]); } - #[test] - fn test_simple_merge() { + fn simple_merge_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { let mut mutation_chunks = MutationChunks::default(); let first = mutation_chunks.new_chunk(); let second = mutation_chunks.new_chunk(); @@ -588,6 +589,13 @@ mod tdd_crossover_semantics { haploid_genomes.starts.push(2); haploid_genomes.stops.push(4); + (mutation_chunks, mutations, haploid_genomes) + } + + #[test] + fn test_simple_merge_1() { + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); + let breakpoint = Position::try_from(CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; single_crossover( @@ -600,7 +608,11 @@ mod tdd_crossover_semantics { ); println!("{output:?}"); assert_eq!(output, &[0, 2]); + } + #[test] + fn test_simple_merge_2() { + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); let breakpoint = Position::try_from(2 * CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; single_crossover( @@ -613,7 +625,11 @@ mod tdd_crossover_semantics { ); println!("{output:?}"); assert_eq!(output, &[0, 1]); + } + #[test] + fn test_simple_merge_3() { + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); println!("last..."); let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; From d403c60dad376fb6a1c123e6794a12f2dc437e34 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Thu, 18 May 2023 14:44:47 -0700 Subject: [PATCH 51/72] some more print help --- src/genome_chunked_array.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index a57f9c2..c174aa7 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -487,6 +487,8 @@ mod tdd_crossover_semantics { [haploid_genomes.starts[genomes.0]..haploid_genomes.stops[genomes.0]]; let genome1 = &haploid_genomes.mutation_chunk_ids [haploid_genomes.starts[genomes.1]..haploid_genomes.stops[genomes.1]]; + println!("genome0 = {genome0:?}"); + println!("genome1 = {genome1:?}"); // What we are doing wrong here: // 1. We are blindly copying over from @@ -529,13 +531,20 @@ mod tdd_crossover_semantics { CHUNK_SIZE }; - if final_pos < CHUNK_SIZE { + println!("final_pos = {final_pos}"); for i in &mutation_chunks.mutation_ids [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] { println!("{i}"); } + if final_pos < CHUNK_SIZE { + //for i in &mutation_chunks.mutation_ids + // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] + //{ + // println!("{i}"); + //} + todo!( "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" ); @@ -606,7 +615,6 @@ mod tdd_crossover_semantics { &mut mutation_chunks, &mut output, ); - println!("{output:?}"); assert_eq!(output, &[0, 2]); } From a0ca99105708498ebe4f6c5e167d583f25f0130c Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 09:50:54 -0700 Subject: [PATCH 52/72] start to abstract out inner ops --- src/genome_chunked_array.rs | 74 ++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index c174aa7..4aaa85b 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -475,6 +475,43 @@ mod tdd_crossover_semantics { use super::*; + fn mutation_positions(s: &[u32], mutations: &[Mutation]) -> Vec { + s.iter() + .map(|&k| mutations[k as usize].position()) + .collect() + } + + enum InsertionType { + Before, + Within(usize), + } + + fn get_insertion_type( + mutation_chunks: &mut MutationChunks, + mutations: &[Mutation], + chunk: usize, + position: Position, + ) -> InsertionType { + assert!(mutation_chunks.occupancy(chunk) > 0); + let s = &mutation_chunks.mutation_ids[chunk * CHUNK_SIZE..(chunk + 1) * CHUNK_SIZE]; + if mutations[s[0] as usize].position() >= position { + InsertionType::Before + } else { + let index = match s + .iter() + .position(|&k| mutations[k as usize].position() < position) + { + Some(index) => index, + None => panic!( + "position must be within chunk {chunk}, {:?}, {:?}", + position, + mutation_positions(s, mutations) + ), + }; + InsertionType::Within(index) + } + } + fn single_crossover( genomes: (usize, usize), breakpoint: Position, @@ -493,7 +530,7 @@ mod tdd_crossover_semantics { // What we are doing wrong here: // 1. We are blindly copying over from // one genome to another. - // 2. What we actaully SHOULD do + // 2. What we actualy SHOULD do // is find if breakpoint exists // WITHIN the chunks, and merge chunks if so // 3. Then, update the index pointers by 1 @@ -532,23 +569,32 @@ mod tdd_crossover_semantics { }; println!("final_pos = {final_pos}"); - for i in &mutation_chunks.mutation_ids - [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] - { - println!("{i}"); - } - - if final_pos < CHUNK_SIZE { - //for i in &mutation_chunks.mutation_ids - // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] - //{ - // println!("{i}"); - //} + for i in &mutation_chunks.mutation_ids + [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] + { + println!("{i}"); + } - todo!( + match get_insertion_type(mutation_chunks, mutations, p0, breakpoint) { + InsertionType::Before => {} + InsertionType::Within(i) => { + todo!( "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" ); + } } + + //if final_pos < CHUNK_SIZE { + // //for i in &mutation_chunks.mutation_ids + // // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] + // //{ + // // println!("{i}"); + // //} + + // todo!( + // "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" + // ); + //} output.extend_from_slice(&genome0[0..p0]); let p1 = genome1.partition_point(|&chunk| { let comp = mutation_chunks From 59306280c4cf037beb746e5b12a7f71b022ed429 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 09:56:35 -0700 Subject: [PATCH 53/72] fix closure --- src/genome_chunked_array.rs | 60 ++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 4aaa85b..85a1611 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -497,10 +497,10 @@ mod tdd_crossover_semantics { if mutations[s[0] as usize].position() >= position { InsertionType::Before } else { - let index = match s - .iter() - .position(|&k| mutations[k as usize].position() < position) - { + let index = match s.iter().position(|&k| { + println!("{:?} {:?}", mutations[k as usize].position(), position); + mutations[k as usize].position() >= position + }) { Some(index) => index, None => panic!( "position must be within chunk {chunk}, {:?}, {:?}", @@ -549,37 +549,37 @@ mod tdd_crossover_semantics { return; } - let final_pos = if p0 < genome0.len() - && breakpoint - < mutation_chunks - .last_position(genome0[p0] as usize, mutations) - .unwrap() - { - println!("chunk = {}", genome0[p0]); - let chunk_id = genome0[p0] as usize; - match mutation_chunks.mutation_ids[chunk_id * CHUNK_SIZE..(chunk_id + 1) * CHUNK_SIZE] - .iter() - .position(|&m| mutations[m as usize].position() < breakpoint) - { - Some(index) => index, - None => CHUNK_SIZE, - } - } else { - CHUNK_SIZE - }; - - println!("final_pos = {final_pos}"); - for i in &mutation_chunks.mutation_ids - [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] - { - println!("{i}"); - } + //let final_pos = if p0 < genome0.len() + // && breakpoint + // < mutation_chunks + // .last_position(genome0[p0] as usize, mutations) + // .unwrap() + //{ + // println!("chunk = {}", genome0[p0]); + // let chunk_id = genome0[p0] as usize; + // match mutation_chunks.mutation_ids[chunk_id * CHUNK_SIZE..(chunk_id + 1) * CHUNK_SIZE] + // .iter() + // .position(|&m| mutations[m as usize].position() < breakpoint) + // { + // Some(index) => index, + // None => CHUNK_SIZE, + // } + //} else { + // CHUNK_SIZE + //}; + + //println!("final_pos = {final_pos}"); + //for i in &mutation_chunks.mutation_ids + // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] + //{ + // println!("{i}"); + //} match get_insertion_type(mutation_chunks, mutations, p0, breakpoint) { InsertionType::Before => {} InsertionType::Within(i) => { todo!( - "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" + "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {i}, {CHUNK_SIZE}: {breakpoint:?}" ); } } From 8b1bd9b77baffa8fe9e93a08e7afaf705721c6f5 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 10:06:34 -0700 Subject: [PATCH 54/72] hit the borrow checker --- src/genome_chunked_array.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 85a1611..afebb6c 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -578,6 +578,16 @@ mod tdd_crossover_semantics { match get_insertion_type(mutation_chunks, mutations, p0, breakpoint) { InsertionType::Before => {} InsertionType::Within(i) => { + let new_chunk = mutation_chunks.new_chunk(); + let mut new_chunk_slice = &mut mutation_chunks.mutation_ids + [new_chunk * CHUNK_SIZE..(new_chunk + 1) * CHUNK_SIZE]; + for j in 0..i { + new_chunk_slice[j] = mutation_chunks.mutation_ids[p0 * CHUNK_SIZE + j]; + } + println!( + "new chunk = {:?}", + mutation_positions(new_chunk_slice, mutations) + ); todo!( "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {i}, {CHUNK_SIZE}: {breakpoint:?}" ); From 368312f7224a20c6c67b928ba69cc8684d127a36 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 10:11:22 -0700 Subject: [PATCH 55/72] solve borrow checker --- src/genome_chunked_array.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index afebb6c..4f2e11d 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -579,11 +579,12 @@ mod tdd_crossover_semantics { InsertionType::Before => {} InsertionType::Within(i) => { let new_chunk = mutation_chunks.new_chunk(); - let mut new_chunk_slice = &mut mutation_chunks.mutation_ids - [new_chunk * CHUNK_SIZE..(new_chunk + 1) * CHUNK_SIZE]; for j in 0..i { - new_chunk_slice[j] = mutation_chunks.mutation_ids[p0 * CHUNK_SIZE + j]; + mutation_chunks.mutation_ids[new_chunk * CHUNK_SIZE + j] = + mutation_chunks.mutation_ids[p0 * CHUNK_SIZE + j]; } + let new_chunk_slice = &mut mutation_chunks.mutation_ids + [new_chunk * CHUNK_SIZE..(new_chunk) * CHUNK_SIZE + i]; println!( "new chunk = {:?}", mutation_positions(new_chunk_slice, mutations) From ebcf1a14f3f833418040a5157c879f6af6acdad7 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:22:13 -0700 Subject: [PATCH 56/72] baby steps --- src/genome_chunked_array.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 4f2e11d..5cac318 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -589,6 +589,14 @@ mod tdd_crossover_semantics { "new chunk = {:?}", mutation_positions(new_chunk_slice, mutations) ); + // now, find where the other genome gets involved + let p1 = genome1.partition_point(|&chunk| { + let comp = mutation_chunks + .last_position(chunk as usize, &mutations) + .unwrap() + < breakpoint; + comp + }); todo!( "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {i}, {CHUNK_SIZE}: {breakpoint:?}" ); From ac93ef6139c8ee8c228dd2c9725e48f381e12f9e Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:32:03 -0700 Subject: [PATCH 57/72] outline next steps --- src/genome_chunked_array.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 5cac318..e4d7e4f 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -597,9 +597,12 @@ mod tdd_crossover_semantics { < breakpoint; comp }); - todo!( - "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {i}, {CHUNK_SIZE}: {breakpoint:?}" - ); + match get_insertion_type(mutation_chunks, mutations, p1, breakpoint) { + InsertionType::Before => { + todo!("need to do a bunch of back-filling")}, + InsertionType::Within(_) => { + todo!("need to do a bunch of back-filling")}, + } } } From 1350c1190d188156ba6e0e7ccca75b08ec78cebd Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:33:46 -0700 Subject: [PATCH 58/72] fmt --- src/genome_chunked_array.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index e4d7e4f..68b9b5e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -599,9 +599,11 @@ mod tdd_crossover_semantics { }); match get_insertion_type(mutation_chunks, mutations, p1, breakpoint) { InsertionType::Before => { - todo!("need to do a bunch of back-filling")}, + todo!("need to do a bunch of back-filling") + } InsertionType::Within(_) => { - todo!("need to do a bunch of back-filling")}, + todo!("need to do a bunch of back-filling") + } } } } From b3fb1be2c60d3dd9e34431357673ea443d0c95ff Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:34:31 -0700 Subject: [PATCH 59/72] lint --- src/genome_chunked_array.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 68b9b5e..4833a38 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -591,11 +591,10 @@ mod tdd_crossover_semantics { ); // now, find where the other genome gets involved let p1 = genome1.partition_point(|&chunk| { - let comp = mutation_chunks - .last_position(chunk as usize, &mutations) + mutation_chunks + .last_position(chunk as usize, mutations) .unwrap() - < breakpoint; - comp + < breakpoint }); match get_insertion_type(mutation_chunks, mutations, p1, breakpoint) { InsertionType::Before => { From 06ddd45de4a8e8a9d2e60293b15596ed33c24041 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:37:27 -0700 Subject: [PATCH 60/72] update other internal stuff that will need to abstract later --- src/genome_chunked_array.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 4833a38..4af2a2f 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -583,6 +583,7 @@ mod tdd_crossover_semantics { mutation_chunks.mutation_ids[new_chunk * CHUNK_SIZE + j] = mutation_chunks.mutation_ids[p0 * CHUNK_SIZE + j]; } + mutation_chunks.occupancy[new_chunk] = i as i32; let new_chunk_slice = &mut mutation_chunks.mutation_ids [new_chunk * CHUNK_SIZE..(new_chunk) * CHUNK_SIZE + i]; println!( @@ -598,10 +599,10 @@ mod tdd_crossover_semantics { }); match get_insertion_type(mutation_chunks, mutations, p1, breakpoint) { InsertionType::Before => { - todo!("need to do a bunch of back-filling") + todo!("need to do a bunch of back-filling from a before") } InsertionType::Within(_) => { - todo!("need to do a bunch of back-filling") + todo!("need to do a bunch of back-filling from a within") } } } From 5c406ae69f016ae16577f4c35c731d6eb54767f5 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:43:24 -0700 Subject: [PATCH 61/72] better todo msg --- src/genome_chunked_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 4af2a2f..b0b1d88 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -601,8 +601,8 @@ mod tdd_crossover_semantics { InsertionType::Before => { todo!("need to do a bunch of back-filling from a before") } - InsertionType::Within(_) => { - todo!("need to do a bunch of back-filling from a within") + InsertionType::Within(index) => { + todo!("need to do a bunch of back-filling from a within, {index}") } } } From da7ec7896554b5de2d4638b492324bf74e86980b Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 11:45:50 -0700 Subject: [PATCH 62/72] better better msg --- src/genome_chunked_array.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index b0b1d88..2ad2337 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -602,7 +602,10 @@ mod tdd_crossover_semantics { todo!("need to do a bunch of back-filling from a before") } InsertionType::Within(index) => { - todo!("need to do a bunch of back-filling from a within, {index}") + todo!( + "need to do a bunch of back-filling from a within, {index}, {p1}, {}", + genome1.len() + ) } } } From 865e44b44146b80445608c8933df17cdd96013fe Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 13:45:38 -0700 Subject: [PATCH 63/72] fast path to passing tests --- src/genome_chunked_array.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 2ad2337..79d379e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -602,10 +602,13 @@ mod tdd_crossover_semantics { todo!("need to do a bunch of back-filling from a before") } InsertionType::Within(index) => { - todo!( - "need to do a bunch of back-filling from a within, {index}, {p1}, {}", - genome1.len() - ) + for j in index..CHUNK_SIZE { + mutation_chunks.mutation_ids[new_chunk * CHUNK_SIZE + j] = + mutation_chunks.mutation_ids[p1 * CHUNK_SIZE + j]; + } + output.extend_from_slice(&genome0[0..p0]); + output.push(new_chunk as u32); + return; } } } From 5a6d452e6d6e8cabbdc63e8a544d7590305f2839 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 13:54:06 -0700 Subject: [PATCH 64/72] fill final output --- src/genome_chunked_array.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 79d379e..1409f1c 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -608,6 +608,12 @@ mod tdd_crossover_semantics { } output.extend_from_slice(&genome0[0..p0]); output.push(new_chunk as u32); + let new_chunk_slice = &mut mutation_chunks.mutation_ids + [new_chunk * CHUNK_SIZE..(new_chunk + 1) * CHUNK_SIZE]; + println!( + "new chunk = {:?}", + mutation_positions(new_chunk_slice, mutations) + ); return; } } From 4356db2c863094d40b34db88c3ecef4341cada54 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 13:58:03 -0700 Subject: [PATCH 65/72] rename --- src/genome_chunked_array.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 1409f1c..03e768e 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -642,7 +642,7 @@ mod tdd_crossover_semantics { output.extend_from_slice(&genome1[p1..]); } - fn simple_merge_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { + fn simple_merge_simple_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { let mut mutation_chunks = MutationChunks::default(); let first = mutation_chunks.new_chunk(); let second = mutation_chunks.new_chunk(); @@ -685,7 +685,7 @@ mod tdd_crossover_semantics { #[test] fn test_simple_merge_1() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); let breakpoint = Position::try_from(CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; @@ -702,7 +702,7 @@ mod tdd_crossover_semantics { #[test] fn test_simple_merge_2() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); let breakpoint = Position::try_from(2 * CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; single_crossover( @@ -719,7 +719,7 @@ mod tdd_crossover_semantics { #[test] fn test_simple_merge_3() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_test_setup(); + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); println!("last..."); let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); let mut output = vec![]; From 863c36b6163e3a4eccc56b47e48cfdb9b11484de Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 14:30:14 -0700 Subject: [PATCH 66/72] add failing test --- src/genome_chunked_array.rs | 67 ++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 03e768e..44a564a 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -497,10 +497,10 @@ mod tdd_crossover_semantics { if mutations[s[0] as usize].position() >= position { InsertionType::Before } else { - let index = match s.iter().position(|&k| { - println!("{:?} {:?}", mutations[k as usize].position(), position); - mutations[k as usize].position() >= position - }) { + let index = match s + .iter() + .position(|&k| mutations[k as usize].position() >= position) + { Some(index) => index, None => panic!( "position must be within chunk {chunk}, {:?}, {:?}", @@ -734,4 +734,63 @@ mod tdd_crossover_semantics { println!("{output:?}"); assert_eq!(output, &[0, 3]); } + + fn simple_merge_harder_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { + let mut mutation_chunks = MutationChunks::default(); + let first = mutation_chunks.new_chunk(); + let second = mutation_chunks.new_chunk(); + let third = mutation_chunks.new_chunk(); + let mut mutations = vec![]; + for i in 0..CHUNK_SIZE { + mutation_chunks.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); + let pos = i64::try_from(i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + } + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mutation_chunks.mutation_ids[second * CHUNK_SIZE + i] = + (mutations.len() - 1).try_into().unwrap(); + } + + for i in 0..CHUNK_SIZE { + let pos = i64::try_from(i + 5).unwrap(); + mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); + mutation_chunks.mutation_ids[third * CHUNK_SIZE + i] = + (mutations.len() - 1).try_into().unwrap(); + } + + mutation_chunks.occupancy.fill(CHUNK_SIZE as i32); + assert_eq!(mutations.len(), 3 * CHUNK_SIZE); + let mut haploid_genomes = HaploidGenomes::default(); + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(second as u32); + haploid_genomes.starts.push(0); + haploid_genomes.stops.push(2); + // make second genome + haploid_genomes.mutation_chunk_ids.push(first as u32); + haploid_genomes.mutation_chunk_ids.push(third as u32); + haploid_genomes.starts.push(2); + haploid_genomes.stops.push(4); + + (mutation_chunks, mutations, haploid_genomes) + } + + #[test] + fn test_harder_case() { + let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_harder_test_setup(); + println!("last..."); + let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); + let mut output = vec![]; + single_crossover( + (0, 1), + breakpoint, + &mutations, + &haploid_genomes, + &mut mutation_chunks, + &mut output, + ); + println!("{output:?}"); + assert_eq!(output, &[0, 3]); + } } From 49840e612d3ce3944b136a938c921a2cc404abc3 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 14:45:17 -0700 Subject: [PATCH 67/72] cleanz --- src/genome_chunked_array.rs | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 44a564a..a4e89ec 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -549,32 +549,6 @@ mod tdd_crossover_semantics { return; } - //let final_pos = if p0 < genome0.len() - // && breakpoint - // < mutation_chunks - // .last_position(genome0[p0] as usize, mutations) - // .unwrap() - //{ - // println!("chunk = {}", genome0[p0]); - // let chunk_id = genome0[p0] as usize; - // match mutation_chunks.mutation_ids[chunk_id * CHUNK_SIZE..(chunk_id + 1) * CHUNK_SIZE] - // .iter() - // .position(|&m| mutations[m as usize].position() < breakpoint) - // { - // Some(index) => index, - // None => CHUNK_SIZE, - // } - //} else { - // CHUNK_SIZE - //}; - - //println!("final_pos = {final_pos}"); - //for i in &mutation_chunks.mutation_ids - // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] - //{ - // println!("{i}"); - //} - match get_insertion_type(mutation_chunks, mutations, p0, breakpoint) { InsertionType::Before => {} InsertionType::Within(i) => { @@ -620,17 +594,6 @@ mod tdd_crossover_semantics { } } - //if final_pos < CHUNK_SIZE { - // //for i in &mutation_chunks.mutation_ids - // // [(genome0[p0]) as usize * CHUNK_SIZE..((genome0[p0] as usize) + 1) * CHUNK_SIZE] - // //{ - // // println!("{i}"); - // //} - - // todo!( - // "if final_pos < CHUNK_SIZE, then we need a new chunk to work with: {final_pos}, {CHUNK_SIZE}: {breakpoint:?}" - // ); - //} output.extend_from_slice(&genome0[0..p0]); let p1 = genome1.partition_point(|&chunk| { let comp = mutation_chunks From 8d7c74593167398d7e9c53fc6fc2790b6e107e0a Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 14:47:09 -0700 Subject: [PATCH 68/72] cleanz --- src/genome_chunked_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index a4e89ec..44cd61f 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -717,7 +717,7 @@ mod tdd_crossover_semantics { } for i in 0..CHUNK_SIZE { - let pos = i64::try_from(i + 5).unwrap(); + let pos = i64::try_from(CHUNK_SIZE + i + 5).unwrap(); mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); mutation_chunks.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); From 540e5bae57064bac9d7e78d730b99f1ab01ba186 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Fri, 19 May 2023 14:48:00 -0700 Subject: [PATCH 69/72] better test setup, now failing as expected --- src/genome_chunked_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs index 44cd61f..ffd4d5b 100644 --- a/src/genome_chunked_array.rs +++ b/src/genome_chunked_array.rs @@ -754,6 +754,6 @@ mod tdd_crossover_semantics { &mut output, ); println!("{output:?}"); - assert_eq!(output, &[0, 3]); + assert_eq!(output, &[0, 3, 4]); } } From 10094ccf3304be0a63986ddf0999d443a4c71717 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 15 Apr 2024 14:24:04 -0700 Subject: [PATCH 70/72] start over, using Python prototype as guidance --- src/genome_chunked_array.rs | 759 ----------------------------------- src/genome_dynamic_chunks.rs | 42 ++ src/lib.rs | 2 +- 3 files changed, 43 insertions(+), 760 deletions(-) delete mode 100644 src/genome_chunked_array.rs create mode 100644 src/genome_dynamic_chunks.rs diff --git a/src/genome_chunked_array.rs b/src/genome_chunked_array.rs deleted file mode 100644 index ffd4d5b..0000000 --- a/src/genome_chunked_array.rs +++ /dev/null @@ -1,759 +0,0 @@ -//! This is a first draft of storing genomes -//! in "chunks" of a fixed length. A "chunk" -//! is a number of mutations. -//! -//! For this first pass, we will only support -//! efficient operations for non-overlapping -//! generations. Like genome_array.rs, each -//! new generation of genomes will be a new -//! container. Also like genome_array, this -//! design (probably) has efficiency issues -//! for overlapping generations. We will deal -//! with that later. - -use crate::common::Mutation; - -// Mutations will be stored in blocks of 64 -// indexes -const CHUNK_SIZE: usize = 64; - -// Pseudocode of what an OO -// version may look like -mod pseudocode { - struct Genome { - chunks: Vec, // indexes into DiploidPopulation::mutation_chunks::chunks - count: u32, - } - - struct MutationChunk { - mutations: [u32; super::CHUNK_SIZE], // indexes into DiploidPopulation::mutations - count: u32, - } - - impl MutationChunk { - fn new_empty() -> Self { - Self { - mutations: [u32::MAX; super::CHUNK_SIZE], - count: 0, - } - } - } - - struct MutationChunks { - chunks: Vec, - } - - struct DiploidPopulation { - genomes: Vec, - mutation_chunks: MutationChunks, - mutations: Vec, - } -} - -#[derive(Default)] -struct Mutations { - mutations: Vec, - counts: Vec, -} - -#[derive(Default)] -struct MutationChunks { - mutation_ids: Vec, // indexes into mutation vector. - counts: Vec, - // How many valid mutation ids are in a chunk. - // The rest must be the sentinel value u32::MAX - // NOTE: defining this as "how many" is akin to - // the len() of a Vec. That definition may - // make doing things like getting - // the last position out of a chunk awkward. - occupancy: Vec, - // indexes where chunks have count of 0. - // We can use these to "reallocate" new chunks. - queue: Vec, -} - -impl MutationChunks { - fn new_chunk(&mut self) -> usize { - match self.queue.pop() { - Some(index) => { - assert_eq!(self.counts[index], 0); - self.mutation_ids[index * CHUNK_SIZE..index * CHUNK_SIZE + CHUNK_SIZE] - .fill(u32::MAX); - self.occupancy[index] = 0; - index - } - None => { - let index = self.mutation_ids.len(); - self.mutation_ids.resize(index + CHUNK_SIZE, u32::MAX); - self.occupancy.push(0); - self.counts.push(0); - index / CHUNK_SIZE - } - } - } - - fn occupancy(&self, chunk: usize) -> i32 { - self.occupancy[chunk] - } - - fn is_empty(&self, chunk: usize) -> bool { - self.occupancy(chunk) == 0 - } - - fn fill_queue(&mut self) { - self.queue = self - .counts - .iter() - .enumerate() - .filter_map(|(i, c)| if c == &0 { Some(i) } else { None }) - .collect(); - } - - // Not clear that we really want this kind of logic. - // This fn may disappear later. - fn fill_from(&mut self, source: usize, destination: usize) { - assert_ne!(source, destination); - // NOTE: code like this could be a perf bottleneck. - // We may need to get each mutable sub-slice out - // using .split_at_mut() and some indexing. - // With the slices, iterators may be helpful - // in dodging boundary checks. - for i in 0..self.occupancy(source) { - self.mutation_ids[destination * CHUNK_SIZE + i as usize] = - self.mutation_ids[source * CHUNK_SIZE + i as usize]; - } - self.occupancy[destination] += self.occupancy[source]; - } - - fn position_details(&self, chunk: usize, f: F) -> Option - where - F: Fn(usize) -> forrustts::Position, - { - let o = self.occupancy(chunk); - match o { - x if x == 0 => None, - x if x > 0 && ((x as usize) <= CHUNK_SIZE) => Some(f(x as usize)), - _ => panic!("invalid occupancy value"), - } - } - - // In general, we don't want to mess w/empty chunks, - // other than when we FIRST create one. - // so we may revisit some of this logic later - fn first_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { - self.position_details(chunk, |_| { - mutations[self.mutation_ids[chunk * CHUNK_SIZE] as usize].position() - }) - } - - // In general, we don't want to mess w/empty chunks, - // other than when we FIRST create one. - // so we may revisit some of this logic later - fn last_position(&self, chunk: usize, mutations: &[Mutation]) -> Option { - self.position_details(chunk, |x| { - mutations[self.mutation_ids[chunk * CHUNK_SIZE + (x - 1)] as usize].position() - }) - } -} - -// This is a quick implementation of C++'s remove_if, which is a method -// of partitioning a slice such that retained items are STABLY moved to the -// front. -// NOTE: we can probably do this with generics later on -// NOTE: should the be associated w/HaploidGenomes? -fn remove_chunk_if_empty( - mutation_chunk_ids: &mut [u32], - mutation_chunks: &MutationChunks, -) -> usize { - let mut first = match mutation_chunk_ids - .iter() - .position(|&c| mutation_chunks.is_empty(c as usize)) - { - Some(index) => index, - None => mutation_chunk_ids.len(), - }; - let f = first; - for i in f..mutation_chunk_ids.len() { - if !mutation_chunks.is_empty(mutation_chunk_ids[i] as usize) { - mutation_chunk_ids.swap(first, i); - //let temp = mutation_chunk_ids[first]; - //mutation_chunk_ids[first] = mutation_chunk_ids[i]; - //mutation_chunk_ids[i] = temp; - first += 1; - } - } - first -} - -#[derive(Default)] -struct HaploidGenomes { - mutation_chunk_ids: Vec, // each genome is a CONTIGUOUS range of chunk indexes - starts: Vec, // For each genome, where is its first chunk? - stops: Vec, // One past last chunk such that a genome is mutation_chunk_ids[starts[i]..stops[i]] -} - -impl HaploidGenomes { - fn move_empty_chunks(&mut self, mutation_chunks: &MutationChunks) { - assert_eq!(self.starts.len(), self.stops.len()); - let ngenomes = self.starts.len(); - for genome in 0..ngenomes { - let chunks = &mut self.mutation_chunk_ids[self.starts[genome]..self.stops[genome]]; - let nremoved = remove_chunk_if_empty(chunks, mutation_chunks); - self.stops[0] = nremoved; - } - } -} - -#[cfg(test)] -mod test_mutation_chunks { - use super::Mutation; - use super::MutationChunks; - use super::CHUNK_SIZE; - - #[test] - fn test_add_chunk() { - let mut mc = MutationChunks::default(); - let nc = mc.new_chunk(); - assert_eq!(nc, 0); - assert_eq!(mc.occupancy(nc), 0); - } - - #[test] - fn test_recycle_chunk() { - let mut mc = MutationChunks::default(); - let nc = mc.new_chunk(); - assert_eq!(nc, 0); - assert!(mc.is_empty(nc)); - assert!(mc.last_position(nc, &[]).is_none()); - assert!(mc.first_position(nc, &[]).is_none()); - mc.fill_queue(); - let nc = mc.new_chunk(); - assert_eq!(nc, 0); - mc.counts[0] += 1; - mc.fill_queue(); - let nc = mc.new_chunk(); - assert_eq!(nc, 1); - mc.counts[1] += 1; - mc.fill_queue(); - let nc = mc.new_chunk(); - assert_eq!(nc, 2); - } - - #[test] - fn test_first_last_pos_full_chunk() { - let mut mc = MutationChunks::default(); - let first = mc.new_chunk(); - - // Make up some data - let mut mutations = vec![]; - for i in 0..CHUNK_SIZE { - mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - let pos = i64::try_from(i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - } - mc.occupancy[first] = CHUNK_SIZE as i32; - assert!(mc.first_position(first, &mutations).is_some()); - assert!(mc.last_position(first, &mutations).is_some()); - assert_eq!( - mc.first_position(first, &mutations), - Some(forrustts::Position::try_from(0_i64).unwrap()) - ); - assert_eq!( - mc.last_position(first, &mutations), - Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) - ); - } - - #[test] - fn test_fill_entire_chunk_from_another_full_chunk() { - let mut mc = MutationChunks::default(); - let first = mc.new_chunk(); - let second = mc.new_chunk(); - - // Make up some data - for i in 0..CHUNK_SIZE { - mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - } - mc.occupancy[first] = CHUNK_SIZE as i32; - - mc.fill_from(first, second); - let s = &mc.mutation_ids[second * CHUNK_SIZE..]; - for i in 0..CHUNK_SIZE { - assert_eq!(s[i], i as u32); - } - assert_eq!(mc.occupancy(second), CHUNK_SIZE as i32); - } -} - -#[cfg(test)] -mod test_haploid_genomes { - use super::*; - - // I STRONGLY DISLIKE the semantics here. - // This will lead to all sorts of edge cases - #[test] - fn test_search_thru_empty_chunk() { - let mut mc = MutationChunks::default(); - let first = mc.new_chunk(); - let second = mc.new_chunk(); - let third = mc.new_chunk(); - let fourth = mc.new_chunk(); - let mut mutations = vec![]; - for i in 0..CHUNK_SIZE { - mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - let pos = i64::try_from(i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mc.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); - } - mc.occupancy[first] = CHUNK_SIZE as i32; - mc.occupancy[third] = CHUNK_SIZE as i32; - - assert!(mc.first_position(second, &mutations).is_none()); - assert!(mc.last_position(second, &mutations).is_none()); - assert!(mc.first_position(fourth, &mutations).is_none()); - assert!(mc.last_position(fourth, &mutations).is_none()); - assert_eq!( - mc.first_position(first, &mutations), - Some(forrustts::Position::try_from(0).unwrap()) - ); - assert_eq!( - mc.last_position(first, &mutations), - Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) - ); - assert_eq!( - mc.first_position(third, &mutations), - Some(forrustts::Position::try_from(CHUNK_SIZE as i64).unwrap()) - ); - assert_eq!( - mc.last_position(third, &mutations), - Some(forrustts::Position::try_from((2 * CHUNK_SIZE as i64) - 1).unwrap()) - ); - - // Okay, so now we can set up some haploid genomes - let mut haploid_genomes = HaploidGenomes::default(); - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(second as u32); - haploid_genomes.mutation_chunk_ids.push(third as u32); - haploid_genomes.mutation_chunk_ids.push(fourth as u32); - haploid_genomes.starts.push(0); - haploid_genomes.stops.push(4); - - // Okay, now we have a genome with chunks that are full, empty, full. - // We envision this happening when, at the end of a generation, - // all mutations in chunk 1 (2nd chunk) become fixed. The idea is that - // we will mark all elements corresponding to fixed mutations with - // our sentinel value. - - // The goal of this test is to be able to correctly use partition_point - // for this kind of case. - let test_position = forrustts::Position::try_from(100).unwrap(); - assert!(test_position >= mc.first_position(third, &mutations).unwrap()); - assert!(test_position < mc.last_position(third, &mutations).unwrap()); - - let genome = &haploid_genomes.mutation_chunk_ids - [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; - let p = genome.partition_point(|&c| { - let comp = mc.last_position(c as usize, &mutations).is_none() - || mc.last_position(c as usize, &mutations).unwrap() < test_position; - comp - }); - assert!(p > 0); - assert_eq!(p, 2); - - let test_position = forrustts::Position::try_from(200).unwrap(); - let p = genome.partition_point(|&c| { - let comp = mc.last_position(c as usize, &mutations).is_none() - || mc.last_position(c as usize, &mutations).unwrap() < test_position; - comp - }); - assert_eq!(p, 4); - } - - // This leads to saner/safer semantics than the above - #[test] - fn purge_empty_chunks() { - let mut mc = MutationChunks::default(); - let first = mc.new_chunk(); - let second = mc.new_chunk(); - let third = mc.new_chunk(); - let fourth = mc.new_chunk(); - let fifth = mc.new_chunk(); - let mut mutations = vec![]; - for i in 0..CHUNK_SIZE { - mc.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - let pos = i64::try_from(i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mc.mutation_ids[third * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(2 * CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mc.mutation_ids[fifth * CHUNK_SIZE + i] = (mutations.len() - 1).try_into().unwrap(); - } - mc.occupancy[first] = CHUNK_SIZE as i32; - mc.occupancy[third] = CHUNK_SIZE as i32; - mc.occupancy[fifth] = CHUNK_SIZE as i32; - - assert!(mc.first_position(second, &mutations).is_none()); - assert!(mc.last_position(second, &mutations).is_none()); - assert!(mc.first_position(fourth, &mutations).is_none()); - assert!(mc.last_position(fourth, &mutations).is_none()); - assert_eq!( - mc.first_position(first, &mutations), - Some(forrustts::Position::try_from(0).unwrap()) - ); - assert_eq!( - mc.last_position(first, &mutations), - Some(forrustts::Position::try_from((CHUNK_SIZE as i64) - 1).unwrap()) - ); - assert_eq!( - mc.first_position(third, &mutations), - Some(forrustts::Position::try_from(CHUNK_SIZE as i64).unwrap()) - ); - assert_eq!( - mc.last_position(third, &mutations), - Some(forrustts::Position::try_from((2 * CHUNK_SIZE as i64) - 1).unwrap()) - ); - assert_eq!( - mc.first_position(fifth, &mutations), - Some(forrustts::Position::try_from(2 * CHUNK_SIZE as i64).unwrap()) - ); - assert_eq!( - mc.last_position(fifth, &mutations), - Some(forrustts::Position::try_from((3 * CHUNK_SIZE as i64) - 1).unwrap()) - ); - - // Okay, so now we can set up some haploid genomes - let mut haploid_genomes = HaploidGenomes::default(); - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(second as u32); - haploid_genomes.mutation_chunk_ids.push(third as u32); - haploid_genomes.mutation_chunk_ids.push(fourth as u32); - haploid_genomes.mutation_chunk_ids.push(fifth as u32); - haploid_genomes.starts.push(0); - haploid_genomes.stops.push(5); - - haploid_genomes.move_empty_chunks(&mc); - assert_eq!(haploid_genomes.stops[0], 3); - - let genome = &haploid_genomes.mutation_chunk_ids - [haploid_genomes.starts[0]..haploid_genomes.stops[0]]; - assert_eq!(genome, &[first as u32, third as u32, fifth as u32]); - assert_eq!(genome.len(), 3); - assert!(genome - .windows(2) - .all(|w| mutations[w[0] as usize].position() <= mutations[w[1] as usize].position())); - let test_position = forrustts::Position::try_from(100).unwrap(); - let p = genome.partition_point(|&c| { - let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; - comp - }); - assert!(p > 0); - assert!(p == 1); - - let test_position = forrustts::Position::try_from(200).unwrap(); - let p = genome.partition_point(|&c| { - let comp = mc.last_position(c as usize, &mutations).unwrap() < test_position; - comp - }); - assert!(p > 0); - assert!(p == 3); - } -} - -#[cfg(test)] -mod tdd_crossover_semantics { - use forrustts::Position; - - use super::*; - - fn mutation_positions(s: &[u32], mutations: &[Mutation]) -> Vec { - s.iter() - .map(|&k| mutations[k as usize].position()) - .collect() - } - - enum InsertionType { - Before, - Within(usize), - } - - fn get_insertion_type( - mutation_chunks: &mut MutationChunks, - mutations: &[Mutation], - chunk: usize, - position: Position, - ) -> InsertionType { - assert!(mutation_chunks.occupancy(chunk) > 0); - let s = &mutation_chunks.mutation_ids[chunk * CHUNK_SIZE..(chunk + 1) * CHUNK_SIZE]; - if mutations[s[0] as usize].position() >= position { - InsertionType::Before - } else { - let index = match s - .iter() - .position(|&k| mutations[k as usize].position() >= position) - { - Some(index) => index, - None => panic!( - "position must be within chunk {chunk}, {:?}, {:?}", - position, - mutation_positions(s, mutations) - ), - }; - InsertionType::Within(index) - } - } - - fn single_crossover( - genomes: (usize, usize), - breakpoint: Position, - mutations: &[Mutation], - haploid_genomes: &HaploidGenomes, - mutation_chunks: &mut MutationChunks, - output: &mut Vec, - ) { - let genome0 = &haploid_genomes.mutation_chunk_ids - [haploid_genomes.starts[genomes.0]..haploid_genomes.stops[genomes.0]]; - let genome1 = &haploid_genomes.mutation_chunk_ids - [haploid_genomes.starts[genomes.1]..haploid_genomes.stops[genomes.1]]; - println!("genome0 = {genome0:?}"); - println!("genome1 = {genome1:?}"); - - // What we are doing wrong here: - // 1. We are blindly copying over from - // one genome to another. - // 2. What we actualy SHOULD do - // is find if breakpoint exists - // WITHIN the chunks, and merge chunks if so - // 3. Then, update the index pointers by 1 - let p0 = genome0.partition_point(|&chunk| { - let comp = mutation_chunks - .last_position(chunk as usize, &mutations) - .unwrap() - < breakpoint; - comp - }); - - println!("{genome0:?}, {p0}, {breakpoint:?}"); - - if p0 == genome0.len() { - output.extend_from_slice(&genome0[0..p0]); - return; - } - - match get_insertion_type(mutation_chunks, mutations, p0, breakpoint) { - InsertionType::Before => {} - InsertionType::Within(i) => { - let new_chunk = mutation_chunks.new_chunk(); - for j in 0..i { - mutation_chunks.mutation_ids[new_chunk * CHUNK_SIZE + j] = - mutation_chunks.mutation_ids[p0 * CHUNK_SIZE + j]; - } - mutation_chunks.occupancy[new_chunk] = i as i32; - let new_chunk_slice = &mut mutation_chunks.mutation_ids - [new_chunk * CHUNK_SIZE..(new_chunk) * CHUNK_SIZE + i]; - println!( - "new chunk = {:?}", - mutation_positions(new_chunk_slice, mutations) - ); - // now, find where the other genome gets involved - let p1 = genome1.partition_point(|&chunk| { - mutation_chunks - .last_position(chunk as usize, mutations) - .unwrap() - < breakpoint - }); - match get_insertion_type(mutation_chunks, mutations, p1, breakpoint) { - InsertionType::Before => { - todo!("need to do a bunch of back-filling from a before") - } - InsertionType::Within(index) => { - for j in index..CHUNK_SIZE { - mutation_chunks.mutation_ids[new_chunk * CHUNK_SIZE + j] = - mutation_chunks.mutation_ids[p1 * CHUNK_SIZE + j]; - } - output.extend_from_slice(&genome0[0..p0]); - output.push(new_chunk as u32); - let new_chunk_slice = &mut mutation_chunks.mutation_ids - [new_chunk * CHUNK_SIZE..(new_chunk + 1) * CHUNK_SIZE]; - println!( - "new chunk = {:?}", - mutation_positions(new_chunk_slice, mutations) - ); - return; - } - } - } - } - - output.extend_from_slice(&genome0[0..p0]); - let p1 = genome1.partition_point(|&chunk| { - let comp = mutation_chunks - .last_position(chunk as usize, &mutations) - .unwrap() - < breakpoint; - comp - }); - output.extend_from_slice(&genome1[p1..]); - } - - fn simple_merge_simple_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { - let mut mutation_chunks = MutationChunks::default(); - let first = mutation_chunks.new_chunk(); - let second = mutation_chunks.new_chunk(); - let third = mutation_chunks.new_chunk(); - let mut mutations = vec![]; - for i in 0..CHUNK_SIZE { - mutation_chunks.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - let pos = i64::try_from(i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mutation_chunks.mutation_ids[second * CHUNK_SIZE + i] = - (mutations.len() - 1).try_into().unwrap(); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(2 * CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mutation_chunks.mutation_ids[third * CHUNK_SIZE + i] = - (mutations.len() - 1).try_into().unwrap(); - } - mutation_chunks.occupancy.fill(CHUNK_SIZE as i32); - assert_eq!(mutations.len(), 3 * CHUNK_SIZE); - let mut haploid_genomes = HaploidGenomes::default(); - - // make first genome - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(second as u32); - haploid_genomes.starts.push(0); - haploid_genomes.stops.push(2); - // make second genome - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(third as u32); - haploid_genomes.starts.push(2); - haploid_genomes.stops.push(4); - - (mutation_chunks, mutations, haploid_genomes) - } - - #[test] - fn test_simple_merge_1() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); - - let breakpoint = Position::try_from(CHUNK_SIZE as i64).unwrap(); - let mut output = vec![]; - single_crossover( - (0, 1), - breakpoint, - &mutations, - &haploid_genomes, - &mut mutation_chunks, - &mut output, - ); - assert_eq!(output, &[0, 2]); - } - - #[test] - fn test_simple_merge_2() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); - let breakpoint = Position::try_from(2 * CHUNK_SIZE as i64).unwrap(); - let mut output = vec![]; - single_crossover( - (0, 1), - breakpoint, - &mutations, - &haploid_genomes, - &mut mutation_chunks, - &mut output, - ); - println!("{output:?}"); - assert_eq!(output, &[0, 1]); - } - - #[test] - fn test_simple_merge_3() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_simple_test_setup(); - println!("last..."); - let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); - let mut output = vec![]; - single_crossover( - (0, 1), - breakpoint, - &mutations, - &haploid_genomes, - &mut mutation_chunks, - &mut output, - ); - println!("{output:?}"); - assert_eq!(output, &[0, 3]); - } - - fn simple_merge_harder_test_setup() -> (MutationChunks, Vec, HaploidGenomes) { - let mut mutation_chunks = MutationChunks::default(); - let first = mutation_chunks.new_chunk(); - let second = mutation_chunks.new_chunk(); - let third = mutation_chunks.new_chunk(); - let mut mutations = vec![]; - for i in 0..CHUNK_SIZE { - mutation_chunks.mutation_ids[first * CHUNK_SIZE + i] = i.try_into().unwrap(); - let pos = i64::try_from(i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - } - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(CHUNK_SIZE + i).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mutation_chunks.mutation_ids[second * CHUNK_SIZE + i] = - (mutations.len() - 1).try_into().unwrap(); - } - - for i in 0..CHUNK_SIZE { - let pos = i64::try_from(CHUNK_SIZE + i + 5).unwrap(); - mutations.push(Mutation::new(pos.try_into().unwrap(), vec![], 0.into())); - mutation_chunks.mutation_ids[third * CHUNK_SIZE + i] = - (mutations.len() - 1).try_into().unwrap(); - } - - mutation_chunks.occupancy.fill(CHUNK_SIZE as i32); - assert_eq!(mutations.len(), 3 * CHUNK_SIZE); - let mut haploid_genomes = HaploidGenomes::default(); - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(second as u32); - haploid_genomes.starts.push(0); - haploid_genomes.stops.push(2); - // make second genome - haploid_genomes.mutation_chunk_ids.push(first as u32); - haploid_genomes.mutation_chunk_ids.push(third as u32); - haploid_genomes.starts.push(2); - haploid_genomes.stops.push(4); - - (mutation_chunks, mutations, haploid_genomes) - } - - #[test] - fn test_harder_case() { - let (mut mutation_chunks, mutations, haploid_genomes) = simple_merge_harder_test_setup(); - println!("last..."); - let breakpoint = Position::try_from(10 + CHUNK_SIZE as i64).unwrap(); - let mut output = vec![]; - single_crossover( - (0, 1), - breakpoint, - &mutations, - &haploid_genomes, - &mut mutation_chunks, - &mut output, - ); - println!("{output:?}"); - assert_eq!(output, &[0, 3, 4]); - } -} diff --git a/src/genome_dynamic_chunks.rs b/src/genome_dynamic_chunks.rs new file mode 100644 index 0000000..b604266 --- /dev/null +++ b/src/genome_dynamic_chunks.rs @@ -0,0 +1,42 @@ +use crate::common::DiploidGenome; +use crate::common::Mutation; + +struct Chunk { + mutations: Vec, + num_mutations: usize, + count: u32, +} + +#[derive(Default, Clone)] +struct Genome { + chunks: Vec, +} + +pub struct DiploidPopulation { + chunks: Vec, + mutations: Vec, + mutation_counts: Vec, + individuals: Vec, + genomes: Vec, +} + +impl DiploidPopulation { + pub fn new(size: usize) -> Self { + let genomes = vec![Genome::default(); 2 * size]; + let individuals = vec![ + DiploidGenome { + first: 0, + second: 0 + }; + size + ]; + + Self { + genomes, + individuals, + mutations: vec![], + mutation_counts: vec![], + chunks: vec![], + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 1b548f1..d1c1d35 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,7 @@ mod common; pub mod fwdpp_copy; pub mod genome_array; -pub mod genome_chunked_array; +pub mod genome_dynamic_chunks; // Public "API" for examples pub use common::SimParams; From 2de5b31e5fe500614f0f579fe277683a85c9d278 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 15 Apr 2024 14:26:02 -0700 Subject: [PATCH 71/72] We need the chunk length --- src/genome_dynamic_chunks.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/genome_dynamic_chunks.rs b/src/genome_dynamic_chunks.rs index b604266..d3f951b 100644 --- a/src/genome_dynamic_chunks.rs +++ b/src/genome_dynamic_chunks.rs @@ -13,6 +13,7 @@ struct Genome { } pub struct DiploidPopulation { + chunk_length: usize, chunks: Vec, mutations: Vec, mutation_counts: Vec, @@ -21,7 +22,7 @@ pub struct DiploidPopulation { } impl DiploidPopulation { - pub fn new(size: usize) -> Self { + pub fn new(size: usize, chunk_length: usize) -> Self { let genomes = vec![Genome::default(); 2 * size]; let individuals = vec![ DiploidGenome { @@ -32,6 +33,7 @@ impl DiploidPopulation { ]; Self { + chunk_length, genomes, individuals, mutations: vec![], From 5b1b65582cb56b15200c4fe31a4a133f70c10f84 Mon Sep 17 00:00:00 2001 From: "Kevin R. Thornton" Date: Mon, 15 Apr 2024 15:45:02 -0700 Subject: [PATCH 72/72] start crossover fn --- src/genome_dynamic_chunks.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/genome_dynamic_chunks.rs b/src/genome_dynamic_chunks.rs index d3f951b..718213e 100644 --- a/src/genome_dynamic_chunks.rs +++ b/src/genome_dynamic_chunks.rs @@ -1,3 +1,7 @@ +use forrustts::genetics::GenerateBreakpoints; +use forrustts::genetics::GeneticMap; +use forrustts::prelude::*; + use crate::common::DiploidGenome; use crate::common::Mutation; @@ -42,3 +46,23 @@ impl DiploidPopulation { } } } + +fn crossover( + genome1: usize, + genome2: usize, + mutations: &Vec, + crossover_position: Vec, + new_mutations: Vec, + genomes: &mut Vec, + chunks: &mut Vec, +) -> Genome { + let mut current_genome = genome1; + let mut current_genome_index = 0_usize; + let mut current_genome_index_position = 0_usize; + + let mut new_chunks = Vec::::default(); + + todo!("not done"); + + Genome { chunks: new_chunks } +}