Lmm/add seeds in more algorithms (#164)

* Provide better output in flaky tests

* feat: add seed parameter to multiple algorithms

* Update changelog

Co-authored-by: Luis Moreno <morenol@users.noreply.github.com>
This commit is contained in:
morenol
2022-09-21 15:35:22 -04:00
committed by GitHub
parent 48514d1b15
commit 3a44161406
14 changed files with 139 additions and 64 deletions
+19 -2
View File
@@ -5,8 +5,8 @@
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::model_selection::BaseKFold;
use crate::rand::get_rng_impl;
use rand::seq::SliceRandom;
use rand::thread_rng;
/// K-Folds cross-validator
pub struct KFold {
@@ -14,6 +14,9 @@ pub struct KFold {
pub n_splits: usize, // cannot exceed std::usize::MAX
/// Whether to shuffle the data before splitting into batches
pub shuffle: bool,
/// When `shuffle` is true, the seed determines the ordering of the indices,
/// which in turn controls the randomness of each fold.
pub seed: Option<u64>,
}
impl KFold {
@@ -23,8 +26,10 @@ impl KFold {
// initialise indices
let mut indices: Vec<usize> = (0..n_samples).collect();
let mut rng = get_rng_impl(self.seed);
if self.shuffle {
indices.shuffle(&mut thread_rng());
indices.shuffle(&mut rng);
}
// start with `n_splits` fold sizes, each set to `n_samples / n_splits` (integer division).
let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits];
@@ -66,6 +71,7 @@ impl Default for KFold {
KFold {
n_splits: 3,
shuffle: true,
seed: None,
}
}
}
@@ -81,6 +87,12 @@ impl KFold {
self.shuffle = shuffle;
self
}
/// When shuffle is True, random_state affects the ordering of the indices.
pub fn with_seed(mut self, seed: Option<u64>) -> Self {
self.seed = seed;
self
}
}
/// An iterator over indices that split data into training and test set.
@@ -150,6 +162,7 @@ mod tests {
let k = KFold {
n_splits: 3,
shuffle: false,
seed: None,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(33, 100);
let test_indices = k.test_indices(&x);
@@ -165,6 +178,7 @@ mod tests {
let k = KFold {
n_splits: 3,
shuffle: false,
seed: None,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(34, 100);
let test_indices = k.test_indices(&x);
@@ -180,6 +194,7 @@ mod tests {
let k = KFold {
n_splits: 2,
shuffle: false,
seed: None,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let test_masks = k.test_masks(&x);
@@ -206,6 +221,7 @@ mod tests {
let k = KFold {
n_splits: 2,
shuffle: false,
seed: None,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(22, 100);
let train_test_splits: Vec<(Vec<usize>, Vec<usize>)> = k.split(&x).collect();
@@ -238,6 +254,7 @@ mod tests {
let k = KFold {
n_splits: 3,
shuffle: false,
seed: None,
};
let x: DenseMatrix<f64> = DenseMatrix::rand(10, 4);
let expected: Vec<(Vec<usize>, Vec<usize>)> = vec![