diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 2fe7792..d271ed6 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -6,6 +6,7 @@ //! use smartcore::algorithm::neighbour::cover_tree::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -453,7 +454,7 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize)] + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d09f2ed..45fbd6f 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -5,6 +5,7 @@ //! use smartcore::algorithm::neighbour::linear_search::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -137,6 +138,7 @@ mod tests { use super::*; use crate::math::distance::Distances; + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/base.rs b/src/base.rs new file mode 100644 index 0000000..a2d4468 --- /dev/null +++ b/src/base.rs @@ -0,0 +1,10 @@ +//! # Common Interfaces and methods +//! +//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. 
+ +use crate::error::Failed; + +/// Implements method predict that offers a way to estimate target value from new data +pub trait Predictor { + fn predict(&self, x: &X) -> Result; +} diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 7229d92..a742d90 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -9,7 +9,7 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::ensemble::random_forest_classifier::*; +//! use smartcore::ensemble::random_forest_classifier::RandomForestClassifier; //! //! // Iris dataset //! let x = DenseMatrix::from_2d_array(&[ @@ -51,6 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -117,6 +118,12 @@ impl Default for RandomForestClassifierParameters { } } +impl> Predictor for RandomForestClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestClassifier { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 36fa096..52b39f9 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,6 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -106,6 +107,12 @@ impl PartialEq for RandomForestRegressor { } } +impl> Predictor for RandomForestRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestRegressor { /// Build a forest of trees from the training set. 
/// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/lib.rs b/src/lib.rs index 9290c86..a1608c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,7 @@ //! let y = vec![2., 2., 2., 3., 3.]; //! //! // Train classifier -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! //! // Predict classes //! let y_hat = knn.predict(&x).unwrap(); @@ -71,6 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; +pub(crate) mod base; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c768cbf..5b49942 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -274,6 +274,19 @@ pub trait BaseVector: Clone + Debug { /// Copies content of `other` vector. fn copy_from(&mut self, other: &Self); + + /// Take elements from an array. + fn take(&self, index: &[usize]) -> Self { + let n = index.len(); + + let mut result = Self::zeros(n); + + for i in 0..n { + result.set(i, self.get(index[i])); + } + + result + } } /// Generic matrix type. @@ -611,6 +624,32 @@ pub trait BaseMatrix: Clone + Debug { /// Calculates the covariance matrix fn cov(&self) -> Self; + + /// Take elements from an array along an axis. + fn take(&self, index: &[usize], axis: u8) -> Self { + let (n, p) = self.shape(); + + let k = match axis { + 0 => p, + _ => n, + }; + + let mut result = match axis { + 0 => Self::zeros(index.len(), p), + _ => Self::zeros(n, index.len()), + }; + + for i in 0..index.len() { + for j in 0..k { + match axis { + 0 => result.set(i, j, self.get(index[i], j)), + _ => result.set(j, i, self.get(j, index[i])), + }; + } + } + + result + } } /// Generic matrix with additional mixins like various factorization methods. 
@@ -662,6 +701,8 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { #[cfg(test)] mod tests { + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; use crate::linalg::BaseVector; #[test] @@ -684,4 +725,35 @@ mod tests { assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); } + + #[test] + fn vec_take() { + let m = vec![1., 2., 3., 4., 5.]; + + assert_eq!(m.take(&vec!(0, 0, 4, 4)), vec![1., 1., 5., 5.]); + } + + #[test] + fn take() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[3.0, 4.0], + &[5.0, 6.0], + &[7.0, 8.0], + &[9.0, 10.0], + ]); + + let expected_0 = DenseMatrix::from_2d_array(&[&[3.0, 4.0], &[3.0, 4.0], &[7.0, 8.0]]); + + let expected_1 = DenseMatrix::from_2d_array(&[ + &[2.0, 1.0], + &[4.0, 3.0], + &[6.0, 5.0], + &[8.0, 7.0], + &[10.0, 9.0], + ]); + + assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); + assert_eq!(m.take(&vec!(1, 0), 1), expected_1); + } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 085fd5d..6ed40c8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -36,7 +36,7 @@ //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. //! ]); //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = lr.predict(&x).unwrap(); //! 
``` use std::iter::Sum; @@ -917,7 +917,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]); - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index c01f3c7..b386290 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,6 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -66,7 +67,7 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -108,6 +109,12 @@ impl> PartialEq for ElasticNet { } } +impl> Predictor for ElasticNet { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> ElasticNet { /// Fits elastic net regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 7395bdc..0dab3e5 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,6 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -33,7 +34,7 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. 
pub alpha: T, @@ -71,6 +72,12 @@ impl> PartialEq for Lasso { } } +impl> Predictor for Lasso { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> Lasso { /// Fits Lasso regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index d01b817..c7bd872 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,11 +64,12 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -78,7 +79,7 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LinearRegressionSolverName, @@ -107,6 +108,12 @@ impl> PartialEq for LinearRegression { } } +impl> Predictor for LinearRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LinearRegression { /// Fits Linear Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 7b7cab6..b85bbe8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -40,7 +40,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! 
let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` @@ -58,6 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -66,6 +67,11 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +/// Logistic Regression parameters +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct LogisticRegressionParameters { +} + /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] pub struct LogisticRegression> { @@ -97,6 +103,13 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl Default for LogisticRegressionParameters { + fn default() -> Self { + LogisticRegressionParameters { + } + } +} + impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { if self.num_classes != other.num_classes @@ -207,11 +220,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> Predictor for LogisticRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LogisticRegression { /// Fits Logistic Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values - pub fn fit(x: &M, y: &M::RowVector) -> Result, Failed> { + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. 
+ pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); @@ -461,7 +481,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); assert_eq!(lr.coefficients().shape(), (3, 2)); assert_eq!(lr.intercept().shape(), (3, 1)); @@ -484,7 +504,7 @@ mod tests { let x = DenseMatrix::from_vec(15, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -498,7 +518,7 @@ mod tests { let x = DenseMatrix::from_vec(20, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -526,7 +546,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let deserialized_lr: LogisticRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); @@ -562,7 +582,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 98bc639..2b5a898 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -63,12 +63,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use 
crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -78,7 +79,7 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -114,6 +115,12 @@ impl> PartialEq for RidgeRegression { } } +impl> Predictor for RidgeRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> RidgeRegression { /// Fits ridge regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index e292f9c..9034727 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -25,7 +25,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 4028259..129fe16 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -26,7 +26,7 @@ use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index fd320c3..84aa947 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -52,7 +52,7 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 66125a5..9a69184 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -24,7 +24,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index b7c5691..c5dd85d 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -28,7 +28,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff 
--git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 696b5ff..9bfbd6b 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -28,7 +28,7 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Distance metric, a function that calculates distance between two points -pub trait Distance { +pub trait Distance: Clone { /// Calculates distance between _a_ and _b_ fn distance(&self, a: &T, b: &T) -> F; } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index f49300d..42b3994 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -42,7 +42,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs new file mode 100644 index 0000000..0fbe224 --- /dev/null +++ b/src/model_selection/kfold.rs @@ -0,0 +1,286 @@ +//! # KFold +//! +//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), +//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. +//! Underfitted is bad because the model is undetrained and does not fit the training data well. +//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! your data. +//! +//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. 
+ +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use rand::seq::SliceRandom; +use rand::thread_rng; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} + +/// K-Folds cross-validator +pub struct KFold { + /// Number of folds. Must be at least 2. + pub n_splits: usize, // cannot exceed std::usize::MAX + /// Whether to shuffle the data before splitting into batches + pub shuffle: bool, +} + +impl KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // increment by one if odd + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..) { + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) 
{ + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3, + shuffle: true, + } + } +} + +impl KFold { + /// Number of folds. Must be at least 2. + pub fn with_n_splits(mut self, n_splits: usize) -> Self { + self.n_splits = n_splits; + self + } + /// Whether to shuffle the data before splitting into batches + pub fn with_shuffle(mut self, shuffle: bool) -> Self { + self.shuffle = shuffle; + self + } +} + +/// An iterator over indices that split data into training and test set. +pub struct BaseKFoldIter { + indices: Vec, + test_indices: Vec>, +} + +impl Iterator for BaseKFoldIter { + type Item = (Vec, Vec); + + fn next(&mut self) -> Option<(Vec, Vec)> { + self.test_indices.pop().map(|test_index| { + let train_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| !test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + + (train_index, test_index) + }) + } +} + +/// Abstract class for all KFold functionalities +impl BaseKFold for KFold { + type Output = BaseKFoldIter; + + fn n_splits(&self) -> usize { + self.n_splits + } + + fn split>(&self, x: &M) -> Self::Output { + if self.n_splits < 2 { + panic!("Number of splits is too small: {}", self.n_splits); + } + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + let mut test_indices = self.test_masks(x); + test_indices.reverse(); + + BaseKFoldIter { + indices, + test_indices, + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use 
crate::linalg::naive::dense_matrix::*; + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + 
assert_eq!(train_test_splits[0].1.len(), 12_usize); + assert_eq!(train_test_splits[0].0.len(), 11_usize); + assert_eq!(train_test_splits[1].0.len(), 12_usize); + assert_eq!(train_test_splits[1].1.len(), 11_usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } +} diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index bc0f9b8..64527b3 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,21 +9,27 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +use crate::base::Predictor; +use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -use rand::Rng; + +pub mod kfold; /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. 
/// * `y` - target values, should be of size _M_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. +/// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( x: &M, y: &M::RowVector, test_size: f32, + shuffle: bool, ) -> (M, M, M::RowVector, M::RowVector) { if x.shape().0 != y.len() { panic!( @@ -38,155 +44,80 @@ pub fn train_test_split>( } let n = y.len(); - let m = x.shape().1; - let mut rng = rand::thread_rng(); - let mut n_test = 0; - let mut index = vec![false; n]; + let n_test = ((n as f32) * test_size) as usize; - for index_i in index.iter_mut().take(n) { - let p_test: f32 = rng.gen(); - if p_test <= test_size { - *index_i = true; - n_test += 1; - } + if n_test < 1 { + panic!("number of sample is too small {}", n); } - let n_train = n - n_test; + let mut indices: Vec = (0..n).collect(); - let mut x_train = M::zeros(n_train, m); - let mut x_test = M::zeros(n_test, m); - let mut y_train = M::RowVector::zeros(n_train); - let mut y_test = M::RowVector::zeros(n_test); - - let mut r_train = 0; - let mut r_test = 0; - - for (r, index_r) in index.iter().enumerate().take(n) { - if *index_r { - //sample belongs to test - for c in 0..m { - x_test.set(r_test, c, x.get(r, c)); - y_test.set(r_test, y.get(r)); - } - r_test += 1; - } else { - for c in 0..m { - x_train.set(r_train, c, x.get(r, c)); - y_train.set(r_train, y.get(r)); - } - r_train += 1; - } + if shuffle { + indices.shuffle(&mut thread_rng()); } + let x_train = x.take(&indices[n_test..n], 0); + let x_test = x.take(&indices[0..n_test], 0); + let y_train = y.take(&indices[n_test..n]); + let y_test = y.take(&indices[0..n_test]); + (x_train, x_test, y_train, y_test) } -/// -/// KFold Cross-Validation -/// -pub trait BaseKFold { - /// Returns integer indices corresponding to test sets - fn test_indices>(&self, x: &M) -> Vec>; - - /// Returns masksk corresponding to test sets - fn test_masks>(&self, x: &M) -> Vec>; - - /// Return a tuple 
containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +#[derive(Clone, Debug)] +pub struct CrossValidationResult { + pub test_score: Vec, + pub train_score: Vec, } -/// -/// An implementation of KFold -/// -pub struct KFold { - n_splits: usize, // cannot exceed std::usize::MAX - shuffle: bool, - // TODO: to be implemented later - // random_state: i32, -} +impl CrossValidationResult { + pub fn mean_test_score(&self) -> T { + self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() + } -impl Default for KFold { - fn default() -> KFold { - KFold { - n_splits: 3_usize, - shuffle: true, - } + pub fn mean_train_score(&self) -> T { + self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } -/// -/// Abstract class for all KFold functionalities -/// -impl BaseKFold for KFold { - fn test_indices>(&self, x: &M) -> Vec> { - // number of samples (rows) in the matrix - let n_samples: usize = x.shape().0; +pub fn cross_validate( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K, + score: S, +) -> Result, Failed> +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result, + S: Fn(&M::RowVector, &M::RowVector) -> T, +{ + let k = cv.n_splits(); + let mut test_score = Vec::with_capacity(k); + let mut train_score = Vec::with_capacity(k); - // initialise indices - let mut indices: Vec = (0..n_samples).collect(); - if self.shuffle { - indices.shuffle(&mut thread_rng()); - } - // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. 
- let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + for (test_idx, train_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + let test_y = y.take(&test_idx); - // increment by one if odd - for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { - *fold_size += 1; - } + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; - // generate the right array of arrays for test indices - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - let mut current: usize = 0; - for fold_size in fold_sizes.drain(..) { - let stop = current + fold_size; - return_values.push(indices[current..stop].to_vec()); - current = stop - } - - return_values + train_score.push(score(&train_y, &estimator.predict(&train_x)?)); + test_score.push(score(&test_y, &estimator.predict(&test_x)?)); } - fn test_masks>(&self, x: &M) -> Vec> { - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - for test_index in self.test_indices(x).drain(..) { - // init mask - let mut test_mask = vec![false; x.shape().0]; - // set mask's indices to true according to test indices - for i in test_index { - test_mask[i] = true; // can be implemented with map() - } - return_values.push(test_mask); - } - return_values - } - - fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { - let n_samples: usize = x.shape().0; - let indices: Vec = (0..n_samples).collect(); - - let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs - - for test_index in self.test_masks(x).drain(..) 
{ - let train_index = indices - .clone() - .iter() - .enumerate() - .filter(|&(idx, _)| !test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter train indices out according to mask - let test_index = indices - .iter() - .enumerate() - .filter(|&(idx, _)| test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter tests indices out according to mask - return_values.push((train_index, test_index)) - } - return_values - } + Ok(CrossValidationResult { + test_score, + train_score, + }) } #[cfg(test)] @@ -194,14 +125,17 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + use crate::metrics::{accuracy, mean_absolute_error}; + use crate::model_selection::kfold::KFold; + use crate::neighbors::knn_regressor::KNNRegressor; #[test] fn run_train_test_split() { - let n = 100; - let x: DenseMatrix = DenseMatrix::rand(100, 3); - let y = vec![0f64; 100]; + let n = 123; + let x: DenseMatrix = DenseMatrix::rand(n, 3); + let y = vec![0f64; n]; - let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2); + let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); assert!( x_train.shape().0 > (n as f64 * 0.65) as usize @@ -215,126 +149,195 @@ mod tests { assert_eq!(x_test.shape().0, y_test.len()); } - #[test] - fn run_kfold_return_test_indices_simple() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(33, 100); - let test_indices = k.test_indices(&x); + #[derive(Clone)] + struct NoParameters {} - assert_eq!(test_indices[0], (0..11).collect::>()); - assert_eq!(test_indices[1], (11..22).collect::>()); - assert_eq!(test_indices[2], (22..33).collect::>()); + #[test] + fn test_cross_validate_biased() { + struct BiasedEstimator {} + + impl BiasedEstimator { + fn fit>( + _: &M, + _: &M::RowVector, + _: NoParameters, + ) -> Result { + Ok(BiasedEstimator {}) + } + } + + impl> Predictor for BiasedEstimator { + fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + 
Ok(M::RowVector::zeros(n)) + } + } + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = + cross_validate(BiasedEstimator::fit, &x, &y, NoParameters {}, cv, &accuracy).unwrap(); + + assert_eq!(0.4, results.mean_test_score()); + assert_eq!(0.4, results.mean_train_score()); } #[test] - fn run_kfold_return_test_indices_odd() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(34, 100); - let test_indices = k.test_indices(&x); + fn test_cross_validate_knn() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + 
&[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - assert_eq!(test_indices[0], (0..12).collect::>()); - assert_eq!(test_indices[1], (12..23).collect::>()); - assert_eq!(test_indices[2], (23..34).collect::>()); + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = cross_validate( + KNNRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + assert!(results.mean_test_score() < 15.0); + assert!(results.mean_train_score() < results.mean_test_score()); } + use crate::tree::decision_tree_regressor::*; + #[test] - fn run_kfold_return_test_mask_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let test_masks = k.test_masks(&x); + fn test_some_regressor() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - for t in &test_masks[0][0..11] { - // TODO: this can be prob 
done better - assert_eq!(*t, true) - } - for t in &test_masks[0][11..22] { - assert_eq!(*t, false) - } + let cv = KFold::default().with_n_splits(2); - for t in &test_masks[1][0..11] { - assert_eq!(*t, false) - } - for t in &test_masks[1][11..22] { - assert_eq!(*t, true) - } + let results = cross_validate( + DecisionTreeRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } - #[test] - fn run_kfold_return_split_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let train_test_splits = k.split(&x); - - assert_eq!(train_test_splits[0].1, (0..11).collect::>()); - assert_eq!(train_test_splits[0].0, (11..22).collect::>()); - assert_eq!(train_test_splits[1].0, (0..11).collect::>()); - assert_eq!(train_test_splits[1].1, (11..22).collect::>()); - } + use crate::tree::decision_tree_classifier::*; #[test] - fn run_kfold_return_split_simple_shuffle() { - let k = KFold { + fn test_some_classifier() { + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { n_splits: 2, ..KFold::default() }; - let x: DenseMatrix = DenseMatrix::rand(23, 100); - let train_test_splits = k.split(&x); - assert_eq!(train_test_splits[0].1.len(), 12_usize); - 
assert_eq!(train_test_splits[0].0.len(), 11_usize); - assert_eq!(train_test_splits[1].0.len(), 12_usize); - assert_eq!(train_test_splits[1].1.len(), 11_usize); - } + let results = + cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); - #[test] - fn numpy_parity_test() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test, expected_test); - assert_eq!(train, expected_train); - } - } - - #[test] - fn numpy_parity_test_shuffle() { - let k = KFold { - n_splits: 3, - ..KFold::default() - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test.len(), expected_test.len()); - assert_eq!(train.len(), expected_train.len()); - } + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } } diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index dd34ae9..fe299f3 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. 
D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -200,6 +201,12 @@ pub struct BernoulliNB> { binarize: Option, } +impl> Predictor for BernoulliNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> BernoulliNB { /// Fits BernoulliNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c4626ef..ce526ce 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,6 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -246,6 +247,12 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> Predictor for CategoricalNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> CategoricalNB { /// Fits CategoricalNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c5c1fb2..01dacd7 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,6 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! 
``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -181,6 +182,12 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> Predictor for GaussianNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> GaussianNB { /// Fits GaussianNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index c9ac86b..84d3fd1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -187,6 +188,12 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> Predictor for MultinomialNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> MultinomialNB { /// Fits MultinomialNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index f940211..8b4db1b 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -25,31 +25,40 @@ //! &[9., 10.]]); //! let y = vec![2., 2., 2., 3., 3.]; //your class labels //! -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold a vector with estimates of class labels //! 
+use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNClassifierParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNClassifierParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Classifier @@ -62,12 +71,39 @@ pub struct KNNClassifier, T>> { k: usize, } -impl Default for KNNClassifierParameters { +impl, T>> KNNClassifierParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. 
+ /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNClassifierParameters { fn default() -> Self { KNNClassifierParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -95,19 +131,23 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> Predictor + for KNNClassifier +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNClassifier { /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data - /// * `y` - vector with target values (classes) of length N - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. 
+ /// * `y` - vector with target values (classes) of length N /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNClassifierParameters, + parameters: KNNClassifierParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -142,7 +182,7 @@ impl, T>> KNNClassifier { classes, y: yi, k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -187,14 +227,13 @@ impl, T>> KNNClassifier { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::math::distance::Distances; #[test] fn knn_fit_predict() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); assert_eq!(y.to_vec(), y_hat); @@ -207,12 +246,10 @@ mod tests { let knn = KNNClassifier::fit( &x, &y, - Distances::euclidian(), - KNNClassifierParameters { - k: 5, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNClassifierParameters::default() + .with_k(5) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap(); @@ -225,7 +262,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = 
bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index b7c0f2d..a97fdea 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -27,31 +27,41 @@ //! &[5., 5.]]); //! let y = vec![1., 2., 3., 4., 5.]; //your target values //! -//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold predicted value //! //! +use std::marker::PhantomData; + use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNRegressorParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNRegressorParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. 
pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Regressor @@ -63,12 +73,39 @@ pub struct KNNRegressor, T>> { k: usize, } -impl Default for KNNRegressorParameters { +impl, T>> KNNRegressorParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNRegressorParameters { fn default() -> Self { KNNRegressorParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -88,19 +125,23 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> Predictor + for KNNRegressor +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNRegressor { /// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features. 
/// * `x` - training data - /// * `y` - vector with real values - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `y` - vector with real values /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNRegressorParameters, + parameters: KNNRegressorParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -126,7 +167,7 @@ impl, T>> KNNRegressor { Ok(KNNRegressor { y: y.to_vec(), k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -176,12 +217,11 @@ mod tests { let knn = KNNRegressor::fit( &x, &y, - Distances::euclidian(), - KNNRegressorParameters { - k: 3, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNRegressorParameters::default() + .with_k(3) + .with_distance(Distances::euclidian()) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&x).unwrap(); @@ -197,7 +237,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y: Vec = vec![1., 2., 3., 4., 5.]; let y_exp = vec![2., 2., 3., 4., 4.]; - let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); for i in 0..y_hat.len() { @@ -211,7 +251,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![1., 2., 3., 4., 5.]; - let knn = 
KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index be1ad4d..85ea6b8 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -48,7 +48,7 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1f563c1..1e013d2 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -93,16 +93,18 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -113,6 +115,7 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9e166d5..cbe97f7 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -57,13 +57,7 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, -//! Kernels::linear(), -//! SVCParameters { -//! epoch: 2, -//! c: 200.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); //! 
//! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -84,22 +78,26 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVC Parameters -pub struct SVCParameters { - /// Number of epochs +pub struct SVCParameters, K: Kernel> { + /// Number of epochs. pub epoch: usize, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. + m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -136,7 +134,7 @@ struct Cache<'a, T: RealNumber, M: Matrix, K: Kernel> { struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { x: &'a M, y: &'a M::RowVector, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, svmin: usize, svmax: usize, gmin: T, @@ -147,27 +145,61 @@ struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { recalculate_minmax_grad: bool, } -impl Default for SVCParameters { +impl, K: Kernel> SVCParameters { + /// Number of epochs. + pub fn with_epoch(mut self, epoch: usize) -> Self { + self.epoch = epoch; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. 
+ pub fn with_kernel>(&self, kernel: KK) -> SVCParameters { + SVCParameters { + epoch: self.epoch, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVCParameters { fn default() -> Self { SVCParameters { epoch: 2, c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVC { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVC { /// Fits SVC to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - class labels - /// * `kernel` - the kernel function /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, y: &M::RowVector, - kernel: K, - parameters: SVCParameters, + parameters: SVCParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -198,13 +230,13 @@ impl, K: Kernel> SVC { } } - let optimizer = Optimizer::new(x, &y, &kernel, &parameters); + let optimizer = Optimizer::new(x, &y, &parameters.kernel, &parameters); let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { classes, - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -321,7 +353,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &'a M, y: &'a M::RowVector, kernel: &'a K, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -711,17 +743,10 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::linear(), - SVCParameters { - epoch: 2, - c: 200.0, - tol: 1e-3, - }, + SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); - - println!("{:?}", y_hat); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -759,12 +784,7 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::rbf(0.7), - SVCParameters { - epoch: 2, - c: 1.0, - tol: 1e-3, - }, + 
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -801,7 +821,7 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVC, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 5d007d7..25c7ff6 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -49,13 +49,7 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let svr = SVR::fit(&x, &y, -//! LinearKernel {}, -//! SVRParameters { -//! eps: 2.0, -//! c: 10.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)).unwrap(); //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -72,25 +66,30 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; +use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVR Parameters -pub struct SVRParameters { - /// Epsilon in the epsilon-SVR model +pub struct SVRParameters, K: Kernel> { + /// Epsilon in the epsilon-SVR model. pub eps: T, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. 
+ m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -135,16 +134,52 @@ struct Cache { data: Vec>>>, } -impl Default for SVRParameters { +impl, K: Kernel> SVRParameters { + /// Epsilon in the epsilon-SVR model. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. + pub fn with_kernel>(&self, kernel: KK) -> SVRParameters { + SVRParameters { + eps: self.eps, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVRParameters { fn default() -> Self { SVRParameters { eps: T::from_f64(0.1).unwrap(), c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVR { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVR { /// Fits SVR to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. @@ -153,9 +188,8 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. 
pub fn fit( x: &M, - y: &M::RowVector, - kernel: K, - parameters: SVRParameters, + y: &M::RowVector, + parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -165,12 +199,12 @@ impl, K: Kernel> SVR { )); } - let optimizer = Optimizer::new(x, y, &kernel, &parameters); + let optimizer = Optimizer::new(x, y, &parameters.kernel, &parameters); let (support_vectors, weight, b) = optimizer.smo(); Ok(SVR { - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -243,7 +277,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &M, y: &M::RowVector, kernel: &'a K, - parameters: &SVRParameters, + parameters: &SVRParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -513,12 +547,7 @@ mod tests { let y_hat = SVR::fit( &x, &y, - LinearKernel {}, - SVRParameters { - eps: 2.0, - c: 10.0, - tol: 1e-3, - }, + SVRParameters::default().with_eps(2.0).with_c(10.0), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -552,7 +581,7 @@ mod tests { 114.2, 115.7, 116.9, ]; - let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + let svr = SVR::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVR, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 371bc4e..1845d5e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,11 +71,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. 
@@ -269,6 +270,12 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> Predictor for DecisionTreeClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeClassifier { /// Build a decision tree classifier from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 5e80b4c..492f0a1 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,11 +66,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. @@ -189,6 +190,12 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> Predictor for DecisionTreeRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeRegressor { /// Build a decision tree regressor from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.