diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 5b49942..264815b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -281,8 +281,8 @@ pub trait BaseVector: Clone + Debug { let mut result = Self::zeros(n); - for i in 0..n { - result.set(i, self.get(index[i])); + for (i, idx) in index.iter().enumerate() { + result.set(i, self.get(*idx)); } result @@ -639,11 +639,11 @@ pub trait BaseMatrix: Clone + Debug { _ => Self::zeros(n, index.len()), }; - for i in 0..index.len() { + for (i, idx) in index.iter().enumerate() { for j in 0..k { match axis { - 0 => result.set(i, j, self.get(index[i], j)), - _ => result.set(j, i, self.get(j, index[i])), + 0 => result.set(i, j, self.get(*idx, j)), + _ => result.set(j, i, self.get(j, *idx)), }; } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index b85bbe8..ffb845c 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder; /// Logistic Regression parameters #[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LogisticRegressionParameters { -} +pub struct LogisticRegressionParameters {} /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] @@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters { - } + LogisticRegressionParameters {} } } @@ -231,7 +229,11 @@ impl> LogisticRegression { /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
- pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { + pub fn fit( + x: &M, + y: &M::RowVector, + _parameters: LogisticRegressionParameters, + ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 0fbe224..63827c4 100644 --- a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -1,30 +1,13 @@ //! # KFold //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. -//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. -//! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. -//! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! Defines k-fold cross validator. use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -/// An interface for the K-Folds cross-validator -pub trait BaseKFold { - /// An iterator over indices that split data into training and test set. - type Output: Iterator, Vec)>; - /// Return a tuple containing the the training set indices for that split and - /// the testing set indices for that split. 
- fn split>(&self, x: &M) -> Self::Output; - /// Returns the number of splits - fn n_splits(&self) -> usize; -} - /// K-Folds cross-validator pub struct KFold { /// Number of folds. Must be at least 2. @@ -101,12 +84,12 @@ impl KFold { } /// An iterator over indices that split data into training and test set. -pub struct BaseKFoldIter { +pub struct KFoldIter { indices: Vec, test_indices: Vec>, } -impl Iterator for BaseKFoldIter { +impl Iterator for KFoldIter { type Item = (Vec, Vec); fn next(&mut self) -> Option<(Vec, Vec)> { @@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter { /// Abstract class for all KFold functionalities impl BaseKFold for KFold { - type Output = BaseKFoldIter; + type Output = KFoldIter; fn n_splits(&self) -> usize { self.n_splits @@ -148,7 +131,7 @@ impl BaseKFold for KFold { let mut test_indices = self.test_masks(x); test_indices.reverse(); - BaseKFoldIter { + KFoldIter { indices, test_indices, } diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 64527b3..0aabb97 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -14,15 +14,27 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -pub mod kfold; +pub(crate) mod kfold; + +pub use kfold::{KFold, KFoldIter}; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. 
-/// * `y` - target values, should be of size _M_ +/// * `y` - target values, should be of size _N_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. /// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( @@ -65,22 +77,33 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// Cross validation results. #[derive(Clone, Debug)] pub struct CrossValidationResult { + /// Vector with test scores on each cv split pub test_score: Vec, + /// Vector with training scores on each cv split pub train_score: Vec, } impl CrossValidationResult { + /// Average test score pub fn mean_test_score(&self) -> T { self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() } - + /// Average training score pub fn mean_train_score(&self) -> T { self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } +/// Evaluate an estimator by cross-validation using given metric. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. 
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html) pub fn cross_validate( fit_estimator: F, x: &M, @@ -302,7 +325,6 @@ mod tests { #[test] fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], @@ -334,8 +356,15 @@ mod tests { ..KFold::default() }; - let results = - cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); + let results = cross_validate( + DecisionTreeClassifier::fit, + &x, + &y, + Default::default(), + cv, + &accuracy, + ) + .unwrap(); println!("{}", results.mean_test_score()); println!("{}", results.mean_train_score()); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 84d3fd1..849b8db 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -188,7 +188,7 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } -impl> Predictor for MultinomialNB { +impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) } diff --git a/src/svm/svc.rs b/src/svm/svc.rs index cbe97f7..aee4d3f 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -167,8 +167,8 @@ impl, K: Kernel> SVCParameters> Default for SVCParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVC { +impl, K: Kernel> Predictor + for SVC +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -743,10 +745,12 @@ mod tests { let y_hat = SVC::fit( &x, &y, - SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), + SVCParameters::default() + .with_c(200.0) + .with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -784,7 +788,9 @@ mod tests { let y_hat = SVC::fit( &x, &y, 
- SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), + SVCParameters::default() + .with_c(1.0) + .with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 25c7ff6..295ad78 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -134,7 +134,7 @@ struct Cache { data: Vec>>>, } -impl, K: Kernel> SVRParameters { +impl, K: Kernel> SVRParameters { /// Epsilon in the epsilon-SVR model. pub fn with_eps(mut self, eps: T) -> Self { self.eps = eps; @@ -153,11 +153,11 @@ impl, K: Kernel> SVRParameters>(&self, kernel: KK) -> SVRParameters { SVRParameters { - eps: self.eps, + eps: self.eps, c: self.c, tol: self.tol, - kernel: kernel, - m: PhantomData + kernel, + m: PhantomData, } } } @@ -169,12 +169,14 @@ impl> Default for SVRParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVR { +impl, K: Kernel> Predictor + for SVR +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -188,7 +190,7 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, - y: &M::RowVector, + y: &M::RowVector, parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -544,13 +546,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = SVR::fit( - &x, - &y, - SVRParameters::default().with_eps(2.0).with_c(10.0), - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_squared_error(&y_hat, &y) < 2.5); }