KFold cross-validation (#8)

* Add documentation and API
* Add public keyword
* Implement test_indices (debug version)
* Return indices as Vec of Vec
* Consume vector using drain()
* Use shape() to return num of samples
* Implement test_masks
* Implement KFold.split()
* Make trait public
* Add test for split
* Fix samples in shape()
* Implement shuffle
* Simplify return values
* Use usize for n_splits
Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
Lorenzo
2020-10-13 10:10:28 +01:00
committed by GitHub
parent bb96354363
commit a2588f6f45
+232
View File
@@ -13,6 +13,8 @@ extern crate rand;
use crate::linalg::BaseVector; use crate::linalg::BaseVector;
use crate::linalg::Matrix; use crate::linalg::Matrix;
use crate::math::num::RealNumber; use crate::math::num::RealNumber;
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng; use rand::Rng;
/// Splits data into 2 disjoint datasets. /// Splits data into 2 disjoint datasets.
@@ -81,6 +83,113 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
(x_train, x_test, y_train, y_test) (x_train, x_test, y_train, y_test)
} }
///
/// KFold Cross-Validation
///
pub trait BaseKFold {
/// Returns integer indices corresponding to test sets
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>>;
/// Returns masksk corresponding to test sets
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>>;
/// Return a tuple containing the the training set indices for that split and
/// the testing set indices for that split.
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)>;
}
///
/// An implementation of KFold
///
pub struct KFold {
n_splits: usize, // cannot exceed std::usize::MAX
shuffle: bool,
// TODO: to be implemented later
// random_state: i32,
}
impl Default for KFold {
fn default() -> KFold {
KFold {
n_splits: 3 as usize,
shuffle: true,
}
}
}
///
/// Abstract class for all KFold functionalities
///
impl BaseKFold for KFold {
fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
// number of samples (rows) in the matrix
let n_samples: usize = x.shape().0;
// initialise indices
let mut indices: Vec<usize> = (0..n_samples).collect();
if self.shuffle == true {
indices.shuffle(&mut thread_rng());
}
// return a new array of given shape n_split, filled with each element of n_samples divided by n_splits.
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
// increment by one if odd
for i in 0..(n_samples % self.n_splits) {
fold_sizes[i] = fold_sizes[i] + 1;
}
// generate the right array of arrays for test indices
let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
let mut current: usize = 0;
for fold_size in fold_sizes.drain(..) {
let stop = current + fold_size;
return_values.push(indices[current..stop].to_vec());
current = stop
}
return_values
}
fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
for test_index in self.test_indices(x).drain(..) {
// init mask
let mut test_mask = vec![false; x.shape().0];
// set mask's indices to true according to test indices
for i in test_index {
test_mask[i] = true; // can be implemented with map()
}
return_values.push(test_mask);
}
return_values
}
fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)> {
let n_samples: usize = x.shape().0;
let indices: Vec<usize> = (0..n_samples).collect();
let mut return_values: Vec<(Vec<usize>, Vec<usize>)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs
for test_index in self.test_masks(x).drain(..) {
let train_index = indices
.clone()
.iter()
.enumerate()
.filter(|&(idx, _)| test_index[idx] == false)
.map(|(idx, _)| idx)
.collect::<Vec<usize>>(); // filter train indices out according to mask
let test_index = indices
.iter()
.enumerate()
.filter(|&(idx, _)| test_index[idx] == true)
.map(|(idx, _)| idx)
.collect::<Vec<usize>>(); // filter tests indices out according to mask
return_values.push((train_index, test_index))
}
return_values
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
@@ -106,4 +215,127 @@ mod tests {
assert_eq!(x_train.shape().0, y_train.len()); assert_eq!(x_train.shape().0, y_train.len());
assert_eq!(x_test.shape().0, y_test.len()); assert_eq!(x_test.shape().0, y_test.len());
} }
#[test]
fn run_kfold_return_test_indices_simple() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
let test_indices = k.test_indices(&x);
assert_eq!(test_indices[0], (0..11).collect::<Vec<usize>>());
assert_eq!(test_indices[1], (11..22).collect::<Vec<usize>>());
assert_eq!(test_indices[2], (22..33).collect::<Vec<usize>>());
}
#[test]
fn run_kfold_return_test_indices_odd() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
let test_indices = k.test_indices(&x);
assert_eq!(test_indices[0], (0..12).collect::<Vec<usize>>());
assert_eq!(test_indices[1], (12..23).collect::<Vec<usize>>());
assert_eq!(test_indices[2], (23..34).collect::<Vec<usize>>());
}
#[test]
fn run_kfold_return_test_mask_simple() {
let k = KFold {
n_splits: 2,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let test_masks = k.test_masks(&x);
for t in &test_masks[0][0..11] {
// TODO: this can be prob done better
assert_eq!(*t, true)
}
for t in &test_masks[0][11..22] {
assert_eq!(*t, false)
}
for t in &test_masks[1][0..11] {
assert_eq!(*t, false)
}
for t in &test_masks[1][11..22] {
assert_eq!(*t, true)
}
}
#[test]
fn run_kfold_return_split_simple() {
let k = KFold {
n_splits: 2,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let train_test_splits = k.split(&x);
assert_eq!(train_test_splits[0].1, (0..11).collect::<Vec<usize>>());
assert_eq!(train_test_splits[0].0, (11..22).collect::<Vec<usize>>());
assert_eq!(train_test_splits[1].0, (0..11).collect::<Vec<usize>>());
assert_eq!(train_test_splits[1].1, (11..22).collect::<Vec<usize>>());
}
#[test]
fn run_kfold_return_split_simple_shuffle() {
let k = KFold {
n_splits: 2,
..KFold::default()
};
let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
let train_test_splits = k.split(&x);
assert_eq!(train_test_splits[0].1.len(), 12 as usize);
assert_eq!(train_test_splits[0].0.len(), 11 as usize);
assert_eq!(train_test_splits[1].0.len(), 12 as usize);
assert_eq!(train_test_splits[1].1.len(), 11 as usize);
}
#[test]
fn numpy_parity_test() {
let k = KFold {
n_splits: 3,
shuffle: false,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
];
for ((train, test), (expected_train, expected_test)) in
k.split(&x).into_iter().zip(expected)
{
assert_eq!(test, expected_test);
assert_eq!(train, expected_train);
}
}
#[test]
fn numpy_parity_test_shuffle() {
let k = KFold {
n_splits: 3,
..KFold::default()
};
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
(vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
(vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
(vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
];
for ((train, test), (expected_train, expected_test)) in
k.split(&x).into_iter().zip(expected)
{
assert_eq!(test.len(), expected_test.len());
assert_eq!(train.len(), expected_train.len());
}
}
} }