KFold cross-validation (#8)
* Add documentation and API * Add public keyword * Implement test_indices (debug version) * Return indices as Vec of Vec * Consume vector using drain() * Use shape() to return num of samples * Implement test_masks * Implement KFold.split() * Make trait public * Add test for split * Fix samples in shape() * Implement shuffle * Simplify return values * Use usize for n_splits Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
@@ -13,6 +13,8 @@ extern crate rand;
|
|||||||
use crate::linalg::BaseVector;
|
use crate::linalg::BaseVector;
|
||||||
use crate::linalg::Matrix;
|
use crate::linalg::Matrix;
|
||||||
use crate::math::num::RealNumber;
|
use crate::math::num::RealNumber;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use rand::thread_rng;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
|
|
||||||
/// Splits data into 2 disjoint datasets.
|
/// Splits data into 2 disjoint datasets.
|
||||||
@@ -81,6 +83,113 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
|||||||
(x_train, x_test, y_train, y_test)
|
(x_train, x_test, y_train, y_test)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// KFold Cross-Validation
|
||||||
|
///
|
||||||
|
pub trait BaseKFold {
|
||||||
|
/// Returns integer indices corresponding to test sets
|
||||||
|
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>>;
|
||||||
|
|
||||||
|
/// Returns masksk corresponding to test sets
|
||||||
|
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>>;
|
||||||
|
|
||||||
|
/// Return a tuple containing the the training set indices for that split and
|
||||||
|
/// the testing set indices for that split.
|
||||||
|
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// An implementation of KFold
|
||||||
|
///
|
||||||
|
pub struct KFold {
|
||||||
|
n_splits: usize, // cannot exceed std::usize::MAX
|
||||||
|
shuffle: bool,
|
||||||
|
// TODO: to be implemented later
|
||||||
|
// random_state: i32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for KFold {
|
||||||
|
fn default() -> KFold {
|
||||||
|
KFold {
|
||||||
|
n_splits: 3 as usize,
|
||||||
|
shuffle: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Abstract class for all KFold functionalities
|
||||||
|
///
|
||||||
|
impl BaseKFold for KFold {
|
||||||
|
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
|
||||||
|
// number of samples (rows) in the matrix
|
||||||
|
let n_samples: usize = x.shape().0;
|
||||||
|
|
||||||
|
// initialise indices
|
||||||
|
let mut indices: Vec<usize> = (0..n_samples).collect();
|
||||||
|
if self.shuffle == true {
|
||||||
|
indices.shuffle(&mut thread_rng());
|
||||||
|
}
|
||||||
|
// return a new array of given shape n_split, filled with each element of n_samples divided by n_splits.
|
||||||
|
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
|
||||||
|
|
||||||
|
// increment by one if odd
|
||||||
|
for i in 0..(n_samples % self.n_splits) {
|
||||||
|
fold_sizes[i] = fold_sizes[i] + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate the right array of arrays for test indices
|
||||||
|
let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
|
||||||
|
let mut current: usize = 0;
|
||||||
|
for fold_size in fold_sizes.drain(..) {
|
||||||
|
let stop = current + fold_size;
|
||||||
|
return_values.push(indices[current..stop].to_vec());
|
||||||
|
current = stop
|
||||||
|
}
|
||||||
|
|
||||||
|
return_values
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
|
||||||
|
let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
|
||||||
|
for test_index in self.test_indices(x).drain(..) {
|
||||||
|
// init mask
|
||||||
|
let mut test_mask = vec![false; x.shape().0];
|
||||||
|
// set mask's indices to true according to test indices
|
||||||
|
for i in test_index {
|
||||||
|
test_mask[i] = true; // can be implemented with map()
|
||||||
|
}
|
||||||
|
return_values.push(test_mask);
|
||||||
|
}
|
||||||
|
return_values
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)> {
|
||||||
|
let n_samples: usize = x.shape().0;
|
||||||
|
let indices: Vec<usize> = (0..n_samples).collect();
|
||||||
|
|
||||||
|
let mut return_values: Vec<(Vec<usize>, Vec<usize>)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs
|
||||||
|
|
||||||
|
for test_index in self.test_masks(x).drain(..) {
|
||||||
|
let train_index = indices
|
||||||
|
.clone()
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|&(idx, _)| test_index[idx] == false)
|
||||||
|
.map(|(idx, _)| idx)
|
||||||
|
.collect::<Vec<usize>>(); // filter train indices out according to mask
|
||||||
|
let test_index = indices
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|&(idx, _)| test_index[idx] == true)
|
||||||
|
.map(|(idx, _)| idx)
|
||||||
|
.collect::<Vec<usize>>(); // filter tests indices out according to mask
|
||||||
|
return_values.push((train_index, test_index))
|
||||||
|
}
|
||||||
|
return_values
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
@@ -106,4 +215,127 @@ mod tests {
|
|||||||
assert_eq!(x_train.shape().0, y_train.len());
|
assert_eq!(x_train.shape().0, y_train.len());
|
||||||
assert_eq!(x_test.shape().0, y_test.len());
|
assert_eq!(x_test.shape().0, y_test.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn run_kfold_return_test_indices_simple() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 3,
|
||||||
|
shuffle: false,
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
|
||||||
|
let test_indices = k.test_indices(&x);
|
||||||
|
|
||||||
|
assert_eq!(test_indices[0], (0..11).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(test_indices[1], (11..22).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(test_indices[2], (22..33).collect::<Vec<usize>>());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn run_kfold_return_test_indices_odd() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 3,
|
||||||
|
shuffle: false,
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
|
||||||
|
let test_indices = k.test_indices(&x);
|
||||||
|
|
||||||
|
assert_eq!(test_indices[0], (0..12).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(test_indices[1], (12..23).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(test_indices[2], (23..34).collect::<Vec<usize>>());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn run_kfold_return_test_mask_simple() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 2,
|
||||||
|
shuffle: false,
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
||||||
|
let test_masks = k.test_masks(&x);
|
||||||
|
|
||||||
|
for t in &test_masks[0][0..11] {
|
||||||
|
// TODO: this can be prob done better
|
||||||
|
assert_eq!(*t, true)
|
||||||
|
}
|
||||||
|
for t in &test_masks[0][11..22] {
|
||||||
|
assert_eq!(*t, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
for t in &test_masks[1][0..11] {
|
||||||
|
assert_eq!(*t, false)
|
||||||
|
}
|
||||||
|
for t in &test_masks[1][11..22] {
|
||||||
|
assert_eq!(*t, true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn run_kfold_return_split_simple() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 2,
|
||||||
|
shuffle: false,
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
||||||
|
let train_test_splits = k.split(&x);
|
||||||
|
|
||||||
|
assert_eq!(train_test_splits[0].1, (0..11).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(train_test_splits[0].0, (11..22).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(train_test_splits[1].0, (0..11).collect::<Vec<usize>>());
|
||||||
|
assert_eq!(train_test_splits[1].1, (11..22).collect::<Vec<usize>>());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn run_kfold_return_split_simple_shuffle() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 2,
|
||||||
|
..KFold::default()
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
|
||||||
|
let train_test_splits = k.split(&x);
|
||||||
|
|
||||||
|
assert_eq!(train_test_splits[0].1.len(), 12 as usize);
|
||||||
|
assert_eq!(train_test_splits[0].0.len(), 11 as usize);
|
||||||
|
assert_eq!(train_test_splits[1].0.len(), 12 as usize);
|
||||||
|
assert_eq!(train_test_splits[1].1.len(), 11 as usize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn numpy_parity_test() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 3,
|
||||||
|
shuffle: false,
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
|
||||||
|
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
|
||||||
|
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
|
||||||
|
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
|
||||||
|
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
|
||||||
|
];
|
||||||
|
for ((train, test), (expected_train, expected_test)) in
|
||||||
|
k.split(&x).into_iter().zip(expected)
|
||||||
|
{
|
||||||
|
assert_eq!(test, expected_test);
|
||||||
|
assert_eq!(train, expected_train);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn numpy_parity_test_shuffle() {
|
||||||
|
let k = KFold {
|
||||||
|
n_splits: 3,
|
||||||
|
..KFold::default()
|
||||||
|
};
|
||||||
|
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
|
||||||
|
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
|
||||||
|
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
|
||||||
|
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
|
||||||
|
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
|
||||||
|
];
|
||||||
|
for ((train, test), (expected_train, expected_test)) in
|
||||||
|
k.split(&x).into_iter().zip(expected)
|
||||||
|
{
|
||||||
|
assert_eq!(test.len(), expected_test.len());
|
||||||
|
assert_eq!(train.len(), expected_train.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user