diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 1895296..49938cf 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -13,6 +13,8 @@ extern crate rand; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use rand::seq::SliceRandom; +use rand::thread_rng; use rand::Rng; /// Splits data into 2 disjoint datasets. @@ -81,6 +83,113 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// +/// KFold Cross-Validation +/// +pub trait BaseKFold { + /// Returns integer indices corresponding to test sets + fn test_indices>(&self, x: &M) -> Vec>; + + /// Returns masksk corresponding to test sets + fn test_masks>(&self, x: &M) -> Vec>; + + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +} + +/// +/// An implementation of KFold +/// +pub struct KFold { + n_splits: usize, // cannot exceed std::usize::MAX + shuffle: bool, + // TODO: to be implemented later + // random_state: i32, +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3 as usize, + shuffle: true, + } + } +} + +/// +/// Abstract class for all KFold functionalities +/// +impl BaseKFold for KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle == true { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // increment by one if odd + for i in 0..(n_samples % self.n_splits) { + fold_sizes[i] = fold_sizes[i] + 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..) { + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) { + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } + + fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + + let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs + + for test_index in self.test_masks(x).drain(..) { + let train_index = indices + .clone() + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx] == false) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx] == true) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + return_values.push((train_index, test_index)) + } + return_values + } +} + #[cfg(test)] mod tests { @@ -106,4 +215,127 @@ mod tests { assert_eq!(x_train.shape().0, y_train.len()); assert_eq!(x_test.shape().0, y_test.len()); } + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits = k.split(&x); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits = k.split(&x); + + assert_eq!(train_test_splits[0].1.len(), 12 as usize); + assert_eq!(train_test_splits[0].0.len(), 11 as usize); + assert_eq!(train_test_splits[1].0.len(), 12 as usize); + assert_eq!(train_test_splits[1].1.len(), 11 as usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } }