Lmm/add seeds in more algorithms (#164)
* Provide better output in flaky tests * feat: add seed parameter to multiple algorithms * Update changelog Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
This commit is contained in:
@@ -5,8 +5,8 @@
|
||||
use crate::linalg::Matrix;
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::model_selection::BaseKFold;
|
||||
use crate::rand::get_rng_impl;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
/// K-Folds cross-validator
|
||||
pub struct KFold {
|
||||
@@ -14,6 +14,9 @@ pub struct KFold {
|
||||
pub n_splits: usize, // cannot exceed std::usize::MAX
|
||||
/// Whether to shuffle the data before splitting into batches
|
||||
pub shuffle: bool,
|
||||
/// When shuffle is true, seed determines the ordering of the indices,
|
||||
/// which in turn controls the randomness of each fold.
|
||||
pub seed: Option<u64>,
|
||||
}
|
||||
|
||||
impl KFold {
|
||||
@@ -23,8 +26,10 @@ impl KFold {
|
||||
|
||||
// initialise indices
|
||||
let mut indices: Vec<usize> = (0..n_samples).collect();
|
||||
let mut rng = get_rng_impl(self.seed);
|
||||
|
||||
if self.shuffle {
|
||||
indices.shuffle(&mut thread_rng());
|
||||
indices.shuffle(&mut rng);
|
||||
}
|
||||
// create a vector of length n_splits in which every element is n_samples divided by n_splits (integer division).
|
||||
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
|
||||
@@ -66,6 +71,7 @@ impl Default for KFold {
|
||||
KFold {
|
||||
n_splits: 3,
|
||||
shuffle: true,
|
||||
seed: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,6 +87,12 @@ impl KFold {
|
||||
self.shuffle = shuffle;
|
||||
self
|
||||
}
|
||||
|
||||
/// When shuffle is true, `seed` affects the ordering of the indices.
|
||||
pub fn with_seed(mut self, seed: Option<u64>) -> Self {
|
||||
self.seed = seed;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over indices that split data into training and test set.
|
||||
@@ -150,6 +162,7 @@ mod tests {
|
||||
let k = KFold {
|
||||
n_splits: 3,
|
||||
shuffle: false,
|
||||
seed: None,
|
||||
};
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
|
||||
let test_indices = k.test_indices(&x);
|
||||
@@ -165,6 +178,7 @@ mod tests {
|
||||
let k = KFold {
|
||||
n_splits: 3,
|
||||
shuffle: false,
|
||||
seed: None,
|
||||
};
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
|
||||
let test_indices = k.test_indices(&x);
|
||||
@@ -180,6 +194,7 @@ mod tests {
|
||||
let k = KFold {
|
||||
n_splits: 2,
|
||||
shuffle: false,
|
||||
seed: None,
|
||||
};
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
||||
let test_masks = k.test_masks(&x);
|
||||
@@ -206,6 +221,7 @@ mod tests {
|
||||
let k = KFold {
|
||||
n_splits: 2,
|
||||
shuffle: false,
|
||||
seed: None,
|
||||
};
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
|
||||
let train_test_splits: Vec<(Vec<usize>, Vec<usize>)> = k.split(&x).collect();
|
||||
@@ -238,6 +254,7 @@ mod tests {
|
||||
let k = KFold {
|
||||
n_splits: 3,
|
||||
shuffle: false,
|
||||
seed: None,
|
||||
};
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
|
||||
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
//! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
|
||||
//! ];
|
||||
//!
|
||||
//! let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true);
|
||||
//! let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true, None);
|
||||
//!
|
||||
//! println!("X train: {:?}, y train: {}, X test: {:?}, y test: {}",
|
||||
//! x_train.shape(), y_train.len(), x_test.shape(), y_test.len());
|
||||
@@ -107,8 +107,8 @@ use crate::error::Failed;
|
||||
use crate::linalg::BaseVector;
|
||||
use crate::linalg::Matrix;
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::rand::get_rng_impl;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
pub(crate) mod kfold;
|
||||
|
||||
@@ -130,11 +130,13 @@ pub trait BaseKFold {
|
||||
/// * `y` - target values, should be of size _N_
|
||||
/// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
|
||||
/// * `shuffle` - whether or not to shuffle the data before splitting
|
||||
/// * `seed` - controls the shuffling applied to the data before the split. Pass `Some(value)` for reproducible output across multiple function calls.
|
||||
pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
||||
x: &M,
|
||||
y: &M::RowVector,
|
||||
test_size: f32,
|
||||
shuffle: bool,
|
||||
seed: Option<u64>,
|
||||
) -> (M, M, M::RowVector, M::RowVector) {
|
||||
if x.shape().0 != y.len() {
|
||||
panic!(
|
||||
@@ -143,6 +145,7 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
||||
y.len()
|
||||
);
|
||||
}
|
||||
let mut rng = get_rng_impl(seed);
|
||||
|
||||
if test_size <= 0. || test_size > 1.0 {
|
||||
panic!("test_size should be between 0 and 1");
|
||||
@@ -159,7 +162,7 @@ pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
|
||||
let mut indices: Vec<usize> = (0..n).collect();
|
||||
|
||||
if shuffle {
|
||||
indices.shuffle(&mut thread_rng());
|
||||
indices.shuffle(&mut rng);
|
||||
}
|
||||
|
||||
let x_train = x.take(&indices[n_test..n], 0);
|
||||
@@ -292,7 +295,7 @@ mod tests {
|
||||
let x: DenseMatrix<f64> = DenseMatrix::rand(n, 3);
|
||||
let y = vec![0f64; n];
|
||||
|
||||
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true);
|
||||
let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true, None);
|
||||
|
||||
assert!(
|
||||
x_train.shape().0 > (n as f64 * 0.65) as usize
|
||||
|
||||
Reference in New Issue
Block a user