// File: smartcore/src/model_selection/mod.rs
// Commit: morenol 53351b2ece fix needless-range and clippy::ptr_arg warnings. (#36)
//   * Fix needless for loop range
//   * Do not ignore clippy::ptr_arg
// Date: 2020-12-11 16:52:39 -04:00
//! # Model Selection methods
//!
//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate),
//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data.
//! Overfitting is bad because the model we trained fits trained data too well and cant make any inferences on new data.
//! Underfitted is bad because the model is undetrained and does not fit the training data well.
//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for
//! your data.
//!
//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng;
/// Splits data into 2 disjoint datasets: a training set and a test set.
///
/// Each sample is assigned to the test set independently with probability
/// `test_size`, so the realized split ratio varies slightly around `test_size`.
/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
/// * `y` - target values, should be of size _N_ (one target per sample).
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
///
/// # Panics
/// Panics if `x` and `y` disagree on the number of samples, or if `test_size`
/// is outside the half-open interval (0, 1].
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
    x: &M,
    y: &M::RowVector,
    test_size: f32,
) -> (M, M, M::RowVector, M::RowVector) {
    if x.shape().0 != y.len() {
        panic!(
            "x and y should have the same number of samples. |x|: {}, |y|: {}",
            x.shape().0,
            y.len()
        );
    }
    if test_size <= 0. || test_size > 1.0 {
        panic!("test_size should be between 0 and 1");
    }
    let n = y.len();
    let m = x.shape().1;
    let mut rng = rand::thread_rng();
    // Decide independently, per sample, whether it belongs to the test set.
    let mut n_test = 0;
    let mut index = vec![false; n];
    for index_i in index.iter_mut() {
        let p_test: f32 = rng.gen();
        if p_test <= test_size {
            *index_i = true;
            n_test += 1;
        }
    }
    let n_train = n - n_test;
    let mut x_train = M::zeros(n_train, m);
    let mut x_test = M::zeros(n_test, m);
    let mut y_train = M::RowVector::zeros(n_train);
    let mut y_test = M::RowVector::zeros(n_test);
    let mut r_train = 0;
    let mut r_test = 0;
    for (r, &in_test) in index.iter().enumerate() {
        if in_test {
            // sample belongs to the test set
            for c in 0..m {
                x_test.set(r_test, c, x.get(r, c));
            }
            // fix: write the target once per row, not once per column
            y_test.set(r_test, y.get(r));
            r_test += 1;
        } else {
            for c in 0..m {
                x_train.set(r_train, c, x.get(r, c));
            }
            y_train.set(r_train, y.get(r));
            r_train += 1;
        }
    }
    (x_train, x_test, y_train, y_test)
}
///
/// KFold Cross-Validation
///
pub trait BaseKFold {
    /// Returns integer indices corresponding to test sets
    fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>>;
    /// Returns boolean masks corresponding to test sets
    fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>>;
    /// Return a tuple containing the training set indices for that split and
    /// the testing set indices for that split.
    fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)>;
}
///
/// An implementation of KFold
///
pub struct KFold {
    n_splits: usize, // number of folds; cannot exceed std::usize::MAX
    shuffle: bool, // shuffle sample indices before slicing them into folds
    // TODO: to be implemented later
    // random_state: i32,
}
impl Default for KFold {
    /// Default configuration: 3 folds with shuffling enabled.
    fn default() -> KFold {
        let n_splits = 3_usize;
        let shuffle = true;
        KFold { n_splits, shuffle }
    }
}
///
/// Abstract class for all KFold functionalities
///
impl BaseKFold for KFold {
    /// Computes, for each of the `n_splits` folds, the row indices forming its test set.
    /// Folds are contiguous slices of the (optionally shuffled) index array; the first
    /// `n_samples % n_splits` folds receive one extra sample each.
    fn test_indices<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<usize>> {
        // number of samples (rows) in the matrix
        let n_samples: usize = x.shape().0;
        // initialise indices
        let mut indices: Vec<usize> = (0..n_samples).collect();
        if self.shuffle {
            indices.shuffle(&mut thread_rng());
        }
        // base fold size; the leftover `n_samples % n_splits` is spread over the leading folds
        let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
        for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) {
            *fold_size += 1;
        }
        // slice the index array into consecutive folds
        let mut return_values: Vec<Vec<usize>> = Vec::with_capacity(self.n_splits);
        let mut current: usize = 0;
        for fold_size in fold_sizes {
            let stop = current + fold_size;
            return_values.push(indices[current..stop].to_vec());
            current = stop;
        }
        return_values
    }

    /// Converts each fold's test indices into a boolean mask over all rows.
    fn test_masks<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<Vec<bool>> {
        // hoisted out of the loop: the row count does not change between folds
        let n_samples = x.shape().0;
        let mut return_values: Vec<Vec<bool>> = Vec::with_capacity(self.n_splits);
        for test_index in self.test_indices(x) {
            // init mask, then flag each test index
            let mut test_mask = vec![false; n_samples];
            for i in test_index {
                test_mask[i] = true;
            }
            return_values.push(test_mask);
        }
        return_values
    }

    /// Returns, for each fold, `(train_indices, test_indices)`, both in ascending order.
    fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Vec<(Vec<usize>, Vec<usize>)> {
        let n_samples: usize = x.shape().0;
        let mut return_values: Vec<(Vec<usize>, Vec<usize>)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs
        for test_mask in self.test_masks(x) {
            // single pass over all row indices, routed by the mask;
            // replaces two filter passes over a needlessly cloned index vector
            let (test_index, train_index): (Vec<usize>, Vec<usize>) =
                (0..n_samples).partition(|&i| test_mask[i]);
            return_values.push((train_index, test_index))
        }
        return_values
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::*;

    #[test]
    fn run_train_test_split() {
        // use `n` consistently instead of repeating the literal 100
        let n = 100;
        let x: DenseMatrix<f64> = DenseMatrix::rand(n, 3);
        let y = vec![0f64; n];
        let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2);
        // the split is probabilistic, so only assert the sizes fall in a wide band
        assert!(
            x_train.shape().0 > (n as f64 * 0.65) as usize
                && x_train.shape().0 < (n as f64 * 0.95) as usize
        );
        assert!(
            x_test.shape().0 > (n as f64 * 0.05) as usize
                && x_test.shape().0 < (n as f64 * 0.35) as usize
        );
        assert_eq!(x_train.shape().0, y_train.len());
        assert_eq!(x_test.shape().0, y_test.len());
    }

    #[test]
    fn run_kfold_return_test_indices_simple() {
        let k = KFold {
            n_splits: 3,
            shuffle: false,
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
        let test_indices = k.test_indices(&x);
        assert_eq!(test_indices[0], (0..11).collect::<Vec<usize>>());
        assert_eq!(test_indices[1], (11..22).collect::<Vec<usize>>());
        assert_eq!(test_indices[2], (22..33).collect::<Vec<usize>>());
    }

    #[test]
    fn run_kfold_return_test_indices_odd() {
        let k = KFold {
            n_splits: 3,
            shuffle: false,
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
        let test_indices = k.test_indices(&x);
        // 34 = 12 + 11 + 11: the remainder goes to the first fold
        assert_eq!(test_indices[0], (0..12).collect::<Vec<usize>>());
        assert_eq!(test_indices[1], (12..23).collect::<Vec<usize>>());
        assert_eq!(test_indices[2], (23..34).collect::<Vec<usize>>());
    }

    #[test]
    fn run_kfold_return_test_mask_simple() {
        let k = KFold {
            n_splits: 2,
            shuffle: false,
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
        let test_masks = k.test_masks(&x);
        // clippy::bool_assert_comparison: assert!(b) instead of assert_eq!(b, true)
        for t in &test_masks[0][0..11] {
            assert!(*t)
        }
        for t in &test_masks[0][11..22] {
            assert!(!*t)
        }
        for t in &test_masks[1][0..11] {
            assert!(!*t)
        }
        for t in &test_masks[1][11..22] {
            assert!(*t)
        }
    }

    #[test]
    fn run_kfold_return_split_simple() {
        let k = KFold {
            n_splits: 2,
            shuffle: false,
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
        let train_test_splits = k.split(&x);
        assert_eq!(train_test_splits[0].1, (0..11).collect::<Vec<usize>>());
        assert_eq!(train_test_splits[0].0, (11..22).collect::<Vec<usize>>());
        assert_eq!(train_test_splits[1].0, (0..11).collect::<Vec<usize>>());
        assert_eq!(train_test_splits[1].1, (11..22).collect::<Vec<usize>>());
    }

    #[test]
    fn run_kfold_return_split_simple_shuffle() {
        let k = KFold {
            n_splits: 2,
            ..KFold::default()
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(23, 100);
        let train_test_splits = k.split(&x);
        // with shuffling only the fold sizes are deterministic
        assert_eq!(train_test_splits[0].1.len(), 12_usize);
        assert_eq!(train_test_splits[0].0.len(), 11_usize);
        assert_eq!(train_test_splits[1].0.len(), 12_usize);
        assert_eq!(train_test_splits[1].1.len(), 11_usize);
    }

    #[test]
    fn numpy_parity_test() {
        // expected values match numpy/sklearn KFold(n_splits=3, shuffle=False) on 10 samples
        let k = KFold {
            n_splits: 3,
            shuffle: false,
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
        let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
            (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
            (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
            (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
        ];
        for ((train, test), (expected_train, expected_test)) in
            k.split(&x).into_iter().zip(expected)
        {
            assert_eq!(test, expected_test);
            assert_eq!(train, expected_train);
        }
    }

    #[test]
    fn numpy_parity_test_shuffle() {
        let k = KFold {
            n_splits: 3,
            ..KFold::default()
        };
        let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
        let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
            (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]),
            (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]),
            (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]),
        ];
        // shuffled folds: only the lengths are comparable
        for ((train, test), (expected_train, expected_test)) in
            k.split(&x).into_iter().zip(expected)
        {
            assert_eq!(test.len(), expected_test.len());
            assert_eq!(train.len(), expected_train.len());
        }
    }
}