From a2be9e117f96e173c9aaaa0be0e6f79cdac719ac Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 15:41:53 -0800 Subject: [PATCH 1/7] feat: + cross_validate, trait Predictor, refactoring --- src/algorithm/neighbour/cover_tree.rs | 3 +- src/algorithm/neighbour/linear_search.rs | 2 + src/base.rs | 10 + src/ensemble/random_forest_classifier.rs | 9 +- src/ensemble/random_forest_regressor.rs | 7 + src/lib.rs | 3 +- src/linalg/mod.rs | 72 ++++ src/linalg/ndarray_bindings.rs | 4 +- src/linear/elastic_net.rs | 9 +- src/linear/lasso.rs | 9 +- src/linear/linear_regression.rs | 11 +- src/linear/logistic_regression.rs | 34 +- src/linear/ridge_regression.rs | 11 +- src/math/distance/euclidian.rs | 2 +- src/math/distance/hamming.rs | 2 +- src/math/distance/mahalanobis.rs | 2 +- src/math/distance/manhattan.rs | 2 +- src/math/distance/minkowski.rs | 2 +- src/math/distance/mod.rs | 2 +- src/metrics/mod.rs | 2 +- src/model_selection/kfold.rs | 286 ++++++++++++++ src/model_selection/mod.rs | 473 ++++++++++++----------- src/naive_bayes/bernoulli.rs | 7 + src/naive_bayes/categorical.rs | 7 + src/naive_bayes/gaussian.rs | 7 + src/naive_bayes/multinomial.rs | 7 + src/neighbors/knn_classifier.rs | 79 +++- src/neighbors/knn_regressor.rs | 80 +++- src/neighbors/mod.rs | 2 +- src/svm/mod.rs | 5 +- src/svm/svc.rs | 94 +++-- src/svm/svr.rs | 83 ++-- src/tree/decision_tree_classifier.rs | 9 +- src/tree/decision_tree_regressor.rs | 9 +- 34 files changed, 977 insertions(+), 369 deletions(-) create mode 100644 src/base.rs create mode 100644 src/model_selection/kfold.rs diff --git a/src/algorithm/neighbour/cover_tree.rs b/src/algorithm/neighbour/cover_tree.rs index 2fe7792..d271ed6 100644 --- a/src/algorithm/neighbour/cover_tree.rs +++ b/src/algorithm/neighbour/cover_tree.rs @@ -6,6 +6,7 @@ //! use smartcore::algorithm::neighbour::cover_tree::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -453,7 +454,7 @@ mod tests { use super::*; use crate::math::distance::Distances; - #[derive(Debug, Serialize, Deserialize)] + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/algorithm/neighbour/linear_search.rs b/src/algorithm/neighbour/linear_search.rs index d09f2ed..45fbd6f 100644 --- a/src/algorithm/neighbour/linear_search.rs +++ b/src/algorithm/neighbour/linear_search.rs @@ -5,6 +5,7 @@ //! use smartcore::algorithm::neighbour::linear_search::*; //! use smartcore::math::distance::Distance; //! +//! #[derive(Clone)] //! struct SimpleDistance {} // Our distance function //! //! impl Distance for SimpleDistance { @@ -137,6 +138,7 @@ mod tests { use super::*; use crate::math::distance::Distances; + #[derive(Debug, Serialize, Deserialize, Clone)] struct SimpleDistance {} impl Distance for SimpleDistance { diff --git a/src/base.rs b/src/base.rs new file mode 100644 index 0000000..a2d4468 --- /dev/null +++ b/src/base.rs @@ -0,0 +1,10 @@ +//! # Common Interfaces and methods +//! +//! This module consolidates interfaces and uniform basic API that is used elsewhere in the code. + +use crate::error::Failed; + +/// Implements method predict that offers a way to estimate target value from new data +pub trait Predictor { + fn predict(&self, x: &X) -> Result; +} diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index 7229d92..a742d90 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -9,7 +9,7 @@ //! //! ``` //! use smartcore::linalg::naive::dense_matrix::*; -//! use smartcore::ensemble::random_forest_classifier::*; +//! use smartcore::ensemble::random_forest_classifier::RandomForestClassifier; //! //! // Iris dataset //! let x = DenseMatrix::from_2d_array(&[ @@ -51,6 +51,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -117,6 +118,12 @@ impl Default for RandomForestClassifierParameters { } } +impl> Predictor for RandomForestClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestClassifier { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 36fa096..52b39f9 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -49,6 +49,7 @@ use std::fmt::Debug; use rand::Rng; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -106,6 +107,12 @@ impl PartialEq for RandomForestRegressor { } } +impl> Predictor for RandomForestRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl RandomForestRegressor { /// Build a forest of trees from the training set. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/lib.rs b/src/lib.rs index 9290c86..a1608c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,7 @@ //! let y = vec![2., 2., 2., 3., 3.]; //! //! // Train classifier -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! //! // Predict classes //! let y_hat = knn.predict(&x).unwrap(); @@ -71,6 +71,7 @@ /// Various algorithms and helper methods that are used elsewhere in SmartCore pub mod algorithm; +pub(crate) mod base; /// Algorithms for clustering of unlabeled data pub mod cluster; /// Various datasets diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index c768cbf..5b49942 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -274,6 +274,19 @@ pub trait BaseVector: Clone + Debug { /// Copies content of `other` vector. fn copy_from(&mut self, other: &Self); + + /// Take elements from an array. + fn take(&self, index: &[usize]) -> Self { + let n = index.len(); + + let mut result = Self::zeros(n); + + for i in 0..n { + result.set(i, self.get(index[i])); + } + + result + } } /// Generic matrix type. @@ -611,6 +624,32 @@ pub trait BaseMatrix: Clone + Debug { /// Calculates the covariance matrix fn cov(&self) -> Self; + + /// Take elements from an array along an axis. + fn take(&self, index: &[usize], axis: u8) -> Self { + let (n, p) = self.shape(); + + let k = match axis { + 0 => p, + _ => n, + }; + + let mut result = match axis { + 0 => Self::zeros(index.len(), p), + _ => Self::zeros(n, index.len()), + }; + + for i in 0..index.len() { + for j in 0..k { + match axis { + 0 => result.set(i, j, self.get(index[i], j)), + _ => result.set(j, i, self.get(j, index[i])), + }; + } + } + + result + } } /// Generic matrix with additional mixins like various factorization methods. @@ -662,6 +701,8 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { #[cfg(test)] mod tests { + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseMatrix; use crate::linalg::BaseVector; #[test] @@ -684,4 +725,35 @@ mod tests { assert!((m.var() - 1.25f64).abs() < std::f64::EPSILON); } + + #[test] + fn vec_take() { + let m = vec![1., 2., 3., 4., 5.]; + + assert_eq!(m.take(&vec!(0, 0, 4, 4)), vec![1., 1., 5., 5.]); + } + + #[test] + fn take() { + let m = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[3.0, 4.0], + &[5.0, 6.0], + &[7.0, 8.0], + &[9.0, 10.0], + ]); + + let expected_0 = DenseMatrix::from_2d_array(&[&[3.0, 4.0], &[3.0, 4.0], &[7.0, 8.0]]); + + let expected_1 = DenseMatrix::from_2d_array(&[ + &[2.0, 1.0], + &[4.0, 3.0], + &[6.0, 5.0], + &[8.0, 7.0], + &[10.0, 9.0], + ]); + + assert_eq!(m.take(&vec!(1, 1, 3), 0), expected_0); + assert_eq!(m.take(&vec!(1, 0), 1), expected_1); + } } diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 085fd5d..6ed40c8 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -36,7 +36,7 @@ //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. //! ]); //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = lr.predict(&x).unwrap(); //! ``` use std::iter::Sum; @@ -917,7 +917,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]); - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index c01f3c7..b386290 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -58,6 +58,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -66,7 +67,7 @@ use crate::math::num::RealNumber; use crate::linear::lasso_optimizer::InteriorPointOptimizer; /// Elastic net parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct ElasticNetParameters { /// Regularization parameter. pub alpha: T, @@ -108,6 +109,12 @@ impl> PartialEq for ElasticNet { } } +impl> Predictor for ElasticNet { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> ElasticNet { /// Fits elastic net regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 7395bdc..0dab3e5 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -26,6 +26,7 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -33,7 +34,7 @@ use crate::linear::lasso_optimizer::InteriorPointOptimizer; use crate::math::num::RealNumber; /// Lasso regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LassoParameters { /// Controls the strength of the penalty to the loss function. pub alpha: T, @@ -71,6 +72,12 @@ impl> PartialEq for Lasso { } } +impl> Predictor for Lasso { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> Lasso { /// Fits Lasso regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index d01b817..c7bd872 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -64,11 +64,12 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. QR is more efficient but SVD is more stable. pub enum LinearRegressionSolverName { /// QR decomposition, see [QR](../../linalg/qr/index.html) @@ -78,7 +79,7 @@ pub enum LinearRegressionSolverName { } /// Linear Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: LinearRegressionSolverName, @@ -107,6 +108,12 @@ impl> PartialEq for LinearRegression { } } +impl> Predictor for LinearRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LinearRegression { /// Fits Linear Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index 7b7cab6..b85bbe8 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -40,7 +40,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` @@ -58,6 +58,7 @@ use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; @@ -66,6 +67,11 @@ use crate::optimization::first_order::{FirstOrderOptimizer, OptimizerResult}; use crate::optimization::line_search::Backtracking; use crate::optimization::FunctionOrder; +/// Logistic Regression parameters +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct LogisticRegressionParameters { +} + /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] pub struct LogisticRegression> { @@ -97,6 +103,13 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { phantom: PhantomData<&'a T>, } +impl Default for LogisticRegressionParameters { + fn default() -> Self { + LogisticRegressionParameters { + } + } +} + impl> PartialEq for LogisticRegression { fn eq(&self, other: &Self) -> bool { if self.num_classes != other.num_classes @@ -207,11 +220,18 @@ impl<'a, T: RealNumber, M: Matrix> ObjectiveFunction } } +impl> Predictor for LogisticRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> LogisticRegression { /// Fits Logistic Regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values - pub fn fit(x: &M, y: &M::RowVector) -> Result, Failed> { + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. + pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); @@ -461,7 +481,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); assert_eq!(lr.coefficients().shape(), (3, 2)); assert_eq!(lr.intercept().shape(), (3, 1)); @@ -484,7 +504,7 @@ mod tests { let x = DenseMatrix::from_vec(15, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -498,7 +518,7 @@ mod tests { let x = DenseMatrix::from_vec(20, 4, &blobs.data); let y = blobs.target; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); @@ -526,7 +546,7 @@ mod tests { ]); let y: Vec = vec![0., 0., 1., 1., 2., 1., 1., 0., 0., 2., 1., 1., 0., 0., 1.]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let deserialized_lr: LogisticRegression> = serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); @@ -562,7 +582,7 @@ mod tests { 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let lr = LogisticRegression::fit(&x, &y).unwrap(); + let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); let y_hat = lr.predict(&x).unwrap(); diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 98bc639..2b5a898 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -63,12 +63,13 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. pub enum RidgeRegressionSolverName { /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) @@ -78,7 +79,7 @@ pub enum RidgeRegressionSolverName { } /// Ridge Regression parameters -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RidgeRegressionParameters { /// Solver to use for estimation of regression coefficients. pub solver: RidgeRegressionSolverName, @@ -114,6 +115,12 @@ impl> PartialEq for RidgeRegression { } } +impl> Predictor for RidgeRegression { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> RidgeRegression { /// Fits ridge regression to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/math/distance/euclidian.rs b/src/math/distance/euclidian.rs index e292f9c..9034727 100644 --- a/src/math/distance/euclidian.rs +++ b/src/math/distance/euclidian.rs @@ -25,7 +25,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Euclidean distance is a measure of the true straight line distance between two points in Euclidean n-space. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Euclidian {} impl Euclidian { diff --git a/src/math/distance/hamming.rs b/src/math/distance/hamming.rs index 4028259..129fe16 100644 --- a/src/math/distance/hamming.rs +++ b/src/math/distance/hamming.rs @@ -26,7 +26,7 @@ use crate::math::num::RealNumber; use super::Distance; /// While comparing two integer-valued vectors of equal length, Hamming distance is the number of bit positions in which the two bits are different -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Hamming {} impl Distance, F> for Hamming { diff --git a/src/math/distance/mahalanobis.rs b/src/math/distance/mahalanobis.rs index fd320c3..84aa947 100644 --- a/src/math/distance/mahalanobis.rs +++ b/src/math/distance/mahalanobis.rs @@ -52,7 +52,7 @@ use super::Distance; use crate::linalg::Matrix; /// Mahalanobis distance. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Mahalanobis> { /// covariance matrix of the dataset pub sigma: M, diff --git a/src/math/distance/manhattan.rs b/src/math/distance/manhattan.rs index 66125a5..9a69184 100644 --- a/src/math/distance/manhattan.rs +++ b/src/math/distance/manhattan.rs @@ -24,7 +24,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Manhattan distance -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Manhattan {} impl Distance, T> for Manhattan { diff --git a/src/math/distance/minkowski.rs b/src/math/distance/minkowski.rs index b7c5691..c5dd85d 100644 --- a/src/math/distance/minkowski.rs +++ b/src/math/distance/minkowski.rs @@ -28,7 +28,7 @@ use crate::math::num::RealNumber; use super::Distance; /// Defines the Minkowski distance of order `p` -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Minkowski { /// order, integer pub p: u16, diff --git a/src/math/distance/mod.rs b/src/math/distance/mod.rs index 696b5ff..9bfbd6b 100644 --- a/src/math/distance/mod.rs +++ b/src/math/distance/mod.rs @@ -28,7 +28,7 @@ use crate::linalg::Matrix; use crate::math::num::RealNumber; /// Distance metric, a function that calculates distance between two points -pub trait Distance { +pub trait Distance: Clone { /// Calculates distance between _a_ and _b_ fn distance(&self, a: &T, b: &T) -> F; } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index f49300d..42b3994 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -42,7 +42,7 @@ //! 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., //! ]; //! -//! let lr = LogisticRegression::fit(&x, &y).unwrap(); +//! let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs new file mode 100644 index 0000000..0fbe224 --- /dev/null +++ b/src/model_selection/kfold.rs @@ -0,0 +1,286 @@ +//! # KFold +//! +//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), +//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. +//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. +//! Underfitted is bad because the model is undetrained and does not fit the training data well. +//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for +//! your data. +//! +//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. + +use crate::linalg::Matrix; +use crate::math::num::RealNumber; +use rand::seq::SliceRandom; +use rand::thread_rng; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} + +/// K-Folds cross-validator +pub struct KFold { + /// Number of folds. Must be at least 2. + pub n_splits: usize, // cannot exceed std::usize::MAX + /// Whether to shuffle the data before splitting into batches + pub shuffle: bool, +} + +impl KFold { + fn test_indices>(&self, x: &M) -> Vec> { + // number of samples (rows) in the matrix + let n_samples: usize = x.shape().0; + + // initialise indices + let mut indices: Vec = (0..n_samples).collect(); + if self.shuffle { + indices.shuffle(&mut thread_rng()); + } + // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. + let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + + // increment by one if odd + for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { + *fold_size += 1; + } + + // generate the right array of arrays for test indices + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + let mut current: usize = 0; + for fold_size in fold_sizes.drain(..) { + let stop = current + fold_size; + return_values.push(indices[current..stop].to_vec()); + current = stop + } + + return_values + } + + fn test_masks>(&self, x: &M) -> Vec> { + let mut return_values: Vec> = Vec::with_capacity(self.n_splits); + for test_index in self.test_indices(x).drain(..) { + // init mask + let mut test_mask = vec![false; x.shape().0]; + // set mask's indices to true according to test indices + for i in test_index { + test_mask[i] = true; // can be implemented with map() + } + return_values.push(test_mask); + } + return_values + } +} + +impl Default for KFold { + fn default() -> KFold { + KFold { + n_splits: 3, + shuffle: true, + } + } +} + +impl KFold { + /// Number of folds. Must be at least 2. + pub fn with_n_splits(mut self, n_splits: usize) -> Self { + self.n_splits = n_splits; + self + } + /// Whether to shuffle the data before splitting into batches + pub fn with_shuffle(mut self, shuffle: bool) -> Self { + self.shuffle = shuffle; + self + } +} + +/// An iterator over indices that split data into training and test set. +pub struct BaseKFoldIter { + indices: Vec, + test_indices: Vec>, +} + +impl Iterator for BaseKFoldIter { + type Item = (Vec, Vec); + + fn next(&mut self) -> Option<(Vec, Vec)> { + self.test_indices.pop().map(|test_index| { + let train_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| !test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter train indices out according to mask + let test_index = self + .indices + .iter() + .enumerate() + .filter(|&(idx, _)| test_index[idx]) + .map(|(idx, _)| idx) + .collect::>(); // filter tests indices out according to mask + + (train_index, test_index) + }) + } +} + +/// Abstract class for all KFold functionalities +impl BaseKFold for KFold { + type Output = BaseKFoldIter; + + fn n_splits(&self) -> usize { + self.n_splits + } + + fn split>(&self, x: &M) -> Self::Output { + if self.n_splits < 2 { + panic!("Number of splits is too small: {}", self.n_splits); + } + let n_samples: usize = x.shape().0; + let indices: Vec = (0..n_samples).collect(); + let mut test_indices = self.test_masks(x); + test_indices.reverse(); + + BaseKFoldIter { + indices, + test_indices, + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn run_kfold_return_test_indices_simple() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(33, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..11).collect::>()); + assert_eq!(test_indices[1], (11..22).collect::>()); + assert_eq!(test_indices[2], (22..33).collect::>()); + } + + #[test] + fn run_kfold_return_test_indices_odd() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(34, 100); + let test_indices = k.test_indices(&x); + + assert_eq!(test_indices[0], (0..12).collect::>()); + assert_eq!(test_indices[1], (12..23).collect::>()); + assert_eq!(test_indices[2], (23..34).collect::>()); + } + + #[test] + fn run_kfold_return_test_mask_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let test_masks = k.test_masks(&x); + + for t in &test_masks[0][0..11] { + // TODO: this can be prob done better + assert_eq!(*t, true) + } + for t in &test_masks[0][11..22] { + assert_eq!(*t, false) + } + + for t in &test_masks[1][0..11] { + assert_eq!(*t, false) + } + for t in &test_masks[1][11..22] { + assert_eq!(*t, true) + } + } + + #[test] + fn run_kfold_return_split_simple() { + let k = KFold { + n_splits: 2, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(22, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + assert_eq!(train_test_splits[0].1, (0..11).collect::>()); + assert_eq!(train_test_splits[0].0, (11..22).collect::>()); + assert_eq!(train_test_splits[1].0, (0..11).collect::>()); + assert_eq!(train_test_splits[1].1, (11..22).collect::>()); + } + + #[test] + fn run_kfold_return_split_simple_shuffle() { + let k = KFold { + n_splits: 2, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(23, 100); + let train_test_splits: Vec<(Vec, Vec)> = k.split(&x).collect(); + + assert_eq!(train_test_splits[0].1.len(), 12_usize); + assert_eq!(train_test_splits[0].0.len(), 11_usize); + assert_eq!(train_test_splits[1].0.len(), 12_usize); + assert_eq!(train_test_splits[1].1.len(), 11_usize); + } + + #[test] + fn numpy_parity_test() { + let k = KFold { + n_splits: 3, + shuffle: false, + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test, expected_test); + assert_eq!(train, expected_train); + } + } + + #[test] + fn numpy_parity_test_shuffle() { + let k = KFold { + n_splits: 3, + ..KFold::default() + }; + let x: DenseMatrix = DenseMatrix::rand(10, 4); + let expected: Vec<(Vec, Vec)> = vec![ + (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), + (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), + (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), + ]; + for ((train, test), (expected_train, expected_test)) in + k.split(&x).into_iter().zip(expected) + { + assert_eq!(test.len(), expected_test.len()); + assert_eq!(train.len(), expected_train.len()); + } + } +} diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index bc0f9b8..64527b3 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -9,21 +9,27 @@ //! //! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +use crate::base::Predictor; +use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -use rand::Rng; + +pub mod kfold; /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. /// * `y` - target values, should be of size _M_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. +/// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( x: &M, y: &M::RowVector, test_size: f32, + shuffle: bool, ) -> (M, M, M::RowVector, M::RowVector) { if x.shape().0 != y.len() { panic!( @@ -38,155 +44,80 @@ pub fn train_test_split>( } let n = y.len(); - let m = x.shape().1; - let mut rng = rand::thread_rng(); - let mut n_test = 0; - let mut index = vec![false; n]; + let n_test = ((n as f32) * test_size) as usize; - for index_i in index.iter_mut().take(n) { - let p_test: f32 = rng.gen(); - if p_test <= test_size { - *index_i = true; - n_test += 1; - } + if n_test < 1 { + panic!("number of sample is too small {}", n); } - let n_train = n - n_test; + let mut indices: Vec = (0..n).collect(); - let mut x_train = M::zeros(n_train, m); - let mut x_test = M::zeros(n_test, m); - let mut y_train = M::RowVector::zeros(n_train); - let mut y_test = M::RowVector::zeros(n_test); - - let mut r_train = 0; - let mut r_test = 0; - - for (r, index_r) in index.iter().enumerate().take(n) { - if *index_r { - //sample belongs to test - for c in 0..m { - x_test.set(r_test, c, x.get(r, c)); - y_test.set(r_test, y.get(r)); - } - r_test += 1; - } else { - for c in 0..m { - x_train.set(r_train, c, x.get(r, c)); - y_train.set(r_train, y.get(r)); - } - r_train += 1; - } + if shuffle { + indices.shuffle(&mut thread_rng()); } + let x_train = x.take(&indices[n_test..n], 0); + let x_test = x.take(&indices[0..n_test], 0); + let y_train = y.take(&indices[n_test..n]); + let y_test = y.take(&indices[0..n_test]); + (x_train, x_test, y_train, y_test) } -/// -/// KFold Cross-Validation -/// -pub trait BaseKFold { - /// Returns integer indices corresponding to test sets - fn test_indices>(&self, x: &M) -> Vec>; - - /// Returns masksk corresponding to test sets - fn test_masks>(&self, x: &M) -> Vec>; - - /// Return a tuple containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Vec<(Vec, Vec)>; +#[derive(Clone, Debug)] +pub struct CrossValidationResult { + pub test_score: Vec, + pub train_score: Vec, } -/// -/// An implementation of KFold -/// -pub struct KFold { - n_splits: usize, // cannot exceed std::usize::MAX - shuffle: bool, - // TODO: to be implemented later - // random_state: i32, -} +impl CrossValidationResult { + pub fn mean_test_score(&self) -> T { + self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() + } -impl Default for KFold { - fn default() -> KFold { - KFold { - n_splits: 3_usize, - shuffle: true, - } + pub fn mean_train_score(&self) -> T { + self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } -/// -/// Abstract class for all KFold functionalities -/// -impl BaseKFold for KFold { - fn test_indices>(&self, x: &M) -> Vec> { - // number of samples (rows) in the matrix - let n_samples: usize = x.shape().0; +pub fn cross_validate( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K, + score: S, +) -> Result, Failed> +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result, + S: Fn(&M::RowVector, &M::RowVector) -> T, +{ + let k = cv.n_splits(); + let mut test_score = Vec::with_capacity(k); + let mut train_score = Vec::with_capacity(k); - // initialise indices - let mut indices: Vec = (0..n_samples).collect(); - if self.shuffle { - indices.shuffle(&mut thread_rng()); - } - // return a new array of given shape n_split, filled with each element of n_samples divided by n_splits. - let mut fold_sizes = vec![n_samples / self.n_splits; self.n_splits]; + for (test_idx, train_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + let test_y = y.take(&test_idx); - // increment by one if odd - for fold_size in fold_sizes.iter_mut().take(n_samples % self.n_splits) { - *fold_size += 1; - } + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; - // generate the right array of arrays for test indices - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - let mut current: usize = 0; - for fold_size in fold_sizes.drain(..) { - let stop = current + fold_size; - return_values.push(indices[current..stop].to_vec()); - current = stop - } - - return_values + train_score.push(score(&train_y, &estimator.predict(&train_x)?)); + test_score.push(score(&test_y, &estimator.predict(&test_x)?)); } - fn test_masks>(&self, x: &M) -> Vec> { - let mut return_values: Vec> = Vec::with_capacity(self.n_splits); - for test_index in self.test_indices(x).drain(..) { - // init mask - let mut test_mask = vec![false; x.shape().0]; - // set mask's indices to true according to test indices - for i in test_index { - test_mask[i] = true; // can be implemented with map() - } - return_values.push(test_mask); - } - return_values - } - - fn split>(&self, x: &M) -> Vec<(Vec, Vec)> { - let n_samples: usize = x.shape().0; - let indices: Vec = (0..n_samples).collect(); - - let mut return_values: Vec<(Vec, Vec)> = Vec::with_capacity(self.n_splits); // TODO: init nested vecs with capacities by getting the length of test_index vecs - - for test_index in self.test_masks(x).drain(..) { - let train_index = indices - .clone() - .iter() - .enumerate() - .filter(|&(idx, _)| !test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter train indices out according to mask - let test_index = indices - .iter() - .enumerate() - .filter(|&(idx, _)| test_index[idx]) - .map(|(idx, _)| idx) - .collect::>(); // filter tests indices out according to mask - return_values.push((train_index, test_index)) - } - return_values - } + Ok(CrossValidationResult { + test_score, + train_score, + }) } #[cfg(test)] @@ -194,14 +125,17 @@ mod tests { use super::*; use crate::linalg::naive::dense_matrix::*; + use crate::metrics::{accuracy, mean_absolute_error}; + use crate::model_selection::kfold::KFold; + use crate::neighbors::knn_regressor::KNNRegressor; #[test] fn run_train_test_split() { - let n = 100; - let x: DenseMatrix = DenseMatrix::rand(100, 3); - let y = vec![0f64; 100]; + let n = 123; + let x: DenseMatrix = DenseMatrix::rand(n, 3); + let y = vec![0f64; n]; - let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2); + let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true); assert!( x_train.shape().0 > (n as f64 * 0.65) as usize @@ -215,126 +149,195 @@ mod tests { assert_eq!(x_test.shape().0, y_test.len()); } - #[test] - fn run_kfold_return_test_indices_simple() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(33, 100); - let test_indices = k.test_indices(&x); + #[derive(Clone)] + struct NoParameters {} - assert_eq!(test_indices[0], (0..11).collect::>()); - assert_eq!(test_indices[1], (11..22).collect::>()); - assert_eq!(test_indices[2], (22..33).collect::>()); + #[test] + fn test_cross_validate_biased() { + struct BiasedEstimator {} + + impl BiasedEstimator { + fn fit>( + _: &M, + _: &M::RowVector, + _: NoParameters, + ) -> Result { + Ok(BiasedEstimator {}) + } + } + + impl> Predictor for BiasedEstimator { + fn predict(&self, x: &M) -> Result { + let (n, _) = x.shape(); + Ok(M::RowVector::zeros(n)) + } + } + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = + cross_validate(BiasedEstimator::fit, &x, &y, NoParameters {}, cv, &accuracy).unwrap(); + + assert_eq!(0.4, results.mean_test_score()); + assert_eq!(0.4, results.mean_train_score()); } #[test] - fn run_kfold_return_test_indices_odd() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(34, 100); - let test_indices = k.test_indices(&x); + fn test_cross_validate_knn() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - assert_eq!(test_indices[0], (0..12).collect::>()); - assert_eq!(test_indices[1], (12..23).collect::>()); - assert_eq!(test_indices[2], (23..34).collect::>()); + let cv = KFold { + n_splits: 5, + ..KFold::default() + }; + + let results = cross_validate( + KNNRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + assert!(results.mean_test_score() < 15.0); + assert!(results.mean_train_score() < results.mean_test_score()); } + use crate::tree::decision_tree_regressor::*; + #[test] - fn run_kfold_return_test_mask_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let test_masks = k.test_masks(&x); + fn test_some_regressor() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159., 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165., 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.27, 1952., 63.639], + &[365.385, 187., 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335., 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.18, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.95, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; - for t in &test_masks[0][0..11] { - // TODO: this can be prob done better - assert_eq!(*t, true) - } - for t in &test_masks[0][11..22] { - assert_eq!(*t, false) - } + let cv = KFold::default().with_n_splits(2); - for t in &test_masks[1][0..11] { - assert_eq!(*t, false) - } - for t in &test_masks[1][11..22] { - assert_eq!(*t, true) - } + let results = cross_validate( + DecisionTreeRegressor::fit, + &x, + &y, + Default::default(), + cv, + &mean_absolute_error, + ) + .unwrap(); + + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } - #[test] - fn run_kfold_return_split_simple() { - let k = KFold { - n_splits: 2, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(22, 100); - let train_test_splits = k.split(&x); - - assert_eq!(train_test_splits[0].1, (0..11).collect::>()); - assert_eq!(train_test_splits[0].0, (11..22).collect::>()); - assert_eq!(train_test_splits[1].0, (0..11).collect::>()); - assert_eq!(train_test_splits[1].1, (11..22).collect::>()); - } + use crate::tree::decision_tree_classifier::*; #[test] - fn run_kfold_return_split_simple_shuffle() { - let k = KFold { + fn test_some_classifier() { + + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + let y = vec![ + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + ]; + + let cv = KFold { n_splits: 2, ..KFold::default() }; - let x: DenseMatrix = DenseMatrix::rand(23, 100); - let train_test_splits = k.split(&x); - assert_eq!(train_test_splits[0].1.len(), 12_usize); - assert_eq!(train_test_splits[0].0.len(), 11_usize); - assert_eq!(train_test_splits[1].0.len(), 12_usize); - assert_eq!(train_test_splits[1].1.len(), 11_usize); - } + let results = + cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); - #[test] - fn numpy_parity_test() { - let k = KFold { - n_splits: 3, - shuffle: false, - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test, expected_test); - assert_eq!(train, expected_train); - } - } - - #[test] - fn numpy_parity_test_shuffle() { - let k = KFold { - n_splits: 3, - ..KFold::default() - }; - let x: DenseMatrix = DenseMatrix::rand(10, 4); - let expected: Vec<(Vec, Vec)> = vec![ - (vec![4, 5, 6, 7, 8, 9], vec![0, 1, 2, 3]), - (vec![0, 1, 2, 3, 7, 8, 9], vec![4, 5, 6]), - (vec![0, 1, 2, 3, 4, 5, 6], vec![7, 8, 9]), - ]; - for ((train, test), (expected_train, expected_test)) in - k.split(&x).into_iter().zip(expected) - { - assert_eq!(test.len(), expected_test.len()); - assert_eq!(train.len(), expected_train.len()); - } + println!("{}", results.mean_test_score()); + println!("{}", results.mean_train_score()); } } diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index dd34ae9..fe299f3 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -200,6 +201,12 @@ pub struct BernoulliNB> { binarize: Option, } +impl> Predictor for BernoulliNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> BernoulliNB { /// Fits BernoulliNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index c4626ef..ce526ce 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -30,6 +30,7 @@ //! let nb = CategoricalNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; @@ -246,6 +247,12 @@ pub struct CategoricalNB> { inner: BaseNaiveBayes>, } +impl> Predictor for CategoricalNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> CategoricalNB { /// Fits CategoricalNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index c5c1fb2..01dacd7 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -22,6 +22,7 @@ //! let nb = GaussianNB::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = nb.predict(&x).unwrap(); //! ``` +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -181,6 +182,12 @@ pub struct GaussianNB> { inner: BaseNaiveBayes>, } +impl> Predictor for GaussianNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> GaussianNB { /// Fits GaussianNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index c9ac86b..84d3fd1 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -33,6 +33,7 @@ //! ## References: //! //! * ["Introduction to Information Retrieval", Manning C. D., Raghavan P., Schutze H., 2009, Chapter 13 ](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::row_iter; use crate::linalg::BaseVector; @@ -187,6 +188,12 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } +impl> Predictor for MultinomialNB { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl> MultinomialNB { /// Fits MultinomialNB with given data /// * `x` - training data of size NxM where N is the number of samples and M is the number of diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index f940211..8b4db1b 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -25,31 +25,40 @@ //! &[9., 10.]]); //! let y = vec![2., 2., 2., 3., 3.]; //your class labels //! -//! let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold a vector with estimates of class labels //! +use std::marker::PhantomData; use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNClassifier` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNClassifierParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNClassifierParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Classifier @@ -62,12 +71,39 @@ pub struct KNNClassifier, T>> { k: usize, } -impl Default for KNNClassifierParameters { +impl, T>> KNNClassifierParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNClassifierParameters { fn default() -> Self { KNNClassifierParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -95,19 +131,23 @@ impl, T>> PartialEq for KNNClassifier { } } +impl, D: Distance, T>> Predictor + for KNNClassifier +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNClassifier { /// Fits KNN classifier to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data - /// * `y` - vector with target values (classes) of length N - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `y` - vector with target values (classes) of length N /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNClassifierParameters, + parameters: KNNClassifierParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -142,7 +182,7 @@ impl, T>> KNNClassifier { classes, y: yi, k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -187,14 +227,13 @@ impl, T>> KNNClassifier { mod tests { use super::*; use crate::linalg::naive::dense_matrix::DenseMatrix; - use crate::math::distance::Distances; #[test] fn knn_fit_predict() { let x = DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); assert_eq!(y.to_vec(), y_hat); @@ -207,12 +246,10 @@ mod tests { let knn = KNNClassifier::fit( &x, &y, - Distances::euclidian(), - KNNClassifierParameters { - k: 5, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNClassifierParameters::default() + .with_k(5) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&DenseMatrix::from_2d_array(&[&[4.1]])).unwrap(); @@ -225,7 +262,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![2., 2., 2., 3., 3.]; - let knn = KNNClassifier::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNClassifier::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index b7c0f2d..a97fdea 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -27,31 +27,41 @@ //! &[5., 5.]]); //! let y = vec![1., 2., 3., 4., 5.]; //your target values //! -//! let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); +//! let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); //! let y_hat = knn.predict(&x).unwrap(); //! ``` //! //! variable `y_hat` will hold predicted value //! //! +use std::marker::PhantomData; + use serde::{Deserialize, Serialize}; use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::{row_iter, BaseVector, Matrix}; -use crate::math::distance::Distance; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; use crate::math::num::RealNumber; use crate::neighbors::KNNWeightFunction; /// `KNNRegressor` parameters. Use `Default::default()` for default values. -#[derive(Serialize, Deserialize, Debug)] -pub struct KNNRegressorParameters { +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct KNNRegressorParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + distance: D, /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub algorithm: KNNAlgorithmName, /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. pub weight: KNNWeightFunction, /// number of training samples to consider when estimating class for new point. Default value is 3. pub k: usize, + /// this parameter is not used + t: PhantomData, } /// K Nearest Neighbors Regressor @@ -63,12 +73,39 @@ pub struct KNNRegressor, T>> { k: usize, } -impl Default for KNNRegressorParameters { +impl, T>> KNNRegressorParameters { + /// number of training samples to consider when estimating class for new point. Default value is 3. + pub fn with_k(mut self, k: usize) -> Self { + self.k = k; + self + } + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance(mut self, distance: D) -> Self { + self.distance = distance; + self + } + /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } + /// weighting function that is used to calculate estimated class value. Default function is `KNNWeightFunction::Uniform`. + pub fn with_weight(mut self, weight: KNNWeightFunction) -> Self { + self.weight = weight; + self + } +} + +impl Default for KNNRegressorParameters { fn default() -> Self { KNNRegressorParameters { + distance: Distances::euclidian(), algorithm: KNNAlgorithmName::CoverTree, weight: KNNWeightFunction::Uniform, k: 3, + t: PhantomData, } } } @@ -88,19 +125,23 @@ impl, T>> PartialEq for KNNRegressor { } } +impl, D: Distance, T>> Predictor + for KNNRegressor +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, T>> KNNRegressor { /// Fits KNN regressor to a NxM matrix where N is number of samples and M is number of features. /// * `x` - training data - /// * `y` - vector with real values - /// * `distance` - a function that defines a distance between each pair of point in training data. - /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. - /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + /// * `y` - vector with real values /// * `parameters` - additional parameters like search algorithm and k pub fn fit>( x: &M, y: &M::RowVector, - distance: D, - parameters: KNNRegressorParameters, + parameters: KNNRegressorParameters, ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); @@ -126,7 +167,7 @@ impl, T>> KNNRegressor { Ok(KNNRegressor { y: y.to_vec(), k: parameters.k, - knn_algorithm: parameters.algorithm.fit(data, distance)?, + knn_algorithm: parameters.algorithm.fit(data, parameters.distance)?, weight: parameters.weight, }) } @@ -176,12 +217,11 @@ mod tests { let knn = KNNRegressor::fit( &x, &y, - Distances::euclidian(), - KNNRegressorParameters { - k: 3, - algorithm: KNNAlgorithmName::LinearSearch, - weight: KNNWeightFunction::Distance, - }, + KNNRegressorParameters::default() + .with_k(3) + .with_distance(Distances::euclidian()) + .with_algorithm(KNNAlgorithmName::LinearSearch) + .with_weight(KNNWeightFunction::Distance), ) .unwrap(); let y_hat = knn.predict(&x).unwrap(); @@ -197,7 +237,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y: Vec = vec![1., 2., 3., 4., 5.]; let y_exp = vec![2., 2., 3., 4., 4.]; - let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let y_hat = knn.predict(&x).unwrap(); assert_eq!(5, Vec::len(&y_hat)); for i in 0..y_hat.len() { @@ -211,7 +251,7 @@ mod tests { DenseMatrix::from_2d_array(&[&[1., 2.], &[3., 4.], &[5., 6.], &[7., 8.], &[9., 10.]]); let y = vec![1., 2., 3., 4., 5.]; - let knn = KNNRegressor::fit(&x, &y, Distances::euclidian(), Default::default()).unwrap(); + let knn = KNNRegressor::fit(&x, &y, Default::default()).unwrap(); let deserialized_knn = bincode::deserialize(&bincode::serialize(&knn).unwrap()).unwrap(); diff --git a/src/neighbors/mod.rs b/src/neighbors/mod.rs index be1ad4d..85ea6b8 100644 --- a/src/neighbors/mod.rs +++ b/src/neighbors/mod.rs @@ -48,7 +48,7 @@ pub mod knn_regressor; pub type KNNAlgorithmName = crate::algorithm::neighbour::KNNAlgorithmName; /// Weight function that is used to determine estimated value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub enum KNNWeightFunction { /// All k nearest points are weighted equally Uniform, diff --git a/src/svm/mod.rs b/src/svm/mod.rs index 1f563c1..1e013d2 100644 --- a/src/svm/mod.rs +++ b/src/svm/mod.rs @@ -93,16 +93,18 @@ impl Kernels { } /// Linear Kernel -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct LinearKernel {} /// Radial basis function (Gaussian) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RBFKernel { /// kernel coefficient pub gamma: T, } /// Polynomial kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct PolynomialKernel { /// degree of the polynomial pub degree: T, @@ -113,6 +115,7 @@ pub struct PolynomialKernel { } /// Sigmoid (hyperbolic tangent) kernel +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct SigmoidKernel { /// kernel coefficient pub gamma: T, diff --git a/src/svm/svc.rs b/src/svm/svc.rs index 9e166d5..cbe97f7 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -57,13 +57,7 @@ //! let y = vec![ 0., 0., 0., 0., 0., 0., 0., 0., //! 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]; //! -//! let svr = SVC::fit(&x, &y, -//! Kernels::linear(), -//! SVCParameters { -//! epoch: 2, -//! c: 200.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVC::fit(&x, &y, SVCParameters::default().with_c(200.0)).unwrap(); //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -84,22 +78,26 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVC Parameters -pub struct SVCParameters { - /// Number of epochs +pub struct SVCParameters, K: Kernel> { + /// Number of epochs. pub epoch: usize, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. + m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -136,7 +134,7 @@ struct Cache<'a, T: RealNumber, M: Matrix, K: Kernel> { struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { x: &'a M, y: &'a M::RowVector, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, svmin: usize, svmax: usize, gmin: T, @@ -147,27 +145,61 @@ struct Optimizer<'a, T: RealNumber, M: Matrix, K: Kernel> { recalculate_minmax_grad: bool, } -impl Default for SVCParameters { +impl, K: Kernel> SVCParameters { + /// Number of epochs. + pub fn with_epoch(mut self, epoch: usize) -> Self { + self.epoch = epoch; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. + pub fn with_kernel>(&self, kernel: KK) -> SVCParameters { + SVCParameters { + epoch: self.epoch, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVCParameters { fn default() -> Self { SVCParameters { epoch: 2, c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVC { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVC { /// Fits SVC to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - class labels - /// * `kernel` - the kernel function /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, y: &M::RowVector, - kernel: K, - parameters: SVCParameters, + parameters: SVCParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -198,13 +230,13 @@ impl, K: Kernel> SVC { } } - let optimizer = Optimizer::new(x, &y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, &y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.optimize(); Ok(SVC { classes, - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -321,7 +353,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &'a M, y: &'a M::RowVector, kernel: &'a K, - parameters: &'a SVCParameters, + parameters: &'a SVCParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -711,17 +743,10 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::linear(), - SVCParameters { - epoch: 2, - c: 200.0, - tol: 1e-3, - }, + SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); - - println!("{:?}", y_hat); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -759,12 +784,7 @@ mod tests { let y_hat = SVC::fit( &x, &y, - Kernels::rbf(0.7), - SVCParameters { - epoch: 2, - c: 1.0, - tol: 1e-3, - }, + SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -801,7 +821,7 @@ mod tests { -1., -1., -1., -1., -1., -1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; - let svr = SVC::fit(&x, &y, Kernels::linear(), Default::default()).unwrap(); + let svr = SVC::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVC, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 5d007d7..25c7ff6 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -49,13 +49,7 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let svr = SVR::fit(&x, &y, -//! LinearKernel {}, -//! SVRParameters { -//! eps: 2.0, -//! c: 10.0, -//! tol: 1e-3, -//! }).unwrap(); +//! let svr = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)).unwrap(); //! //! let y_hat = svr.predict(&x).unwrap(); //! ``` @@ -72,25 +66,30 @@ use std::cell::{Ref, RefCell}; use std::fmt::Debug; +use std::marker::PhantomData; use serde::{Deserialize, Serialize}; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::svm::Kernel; - -#[derive(Serialize, Deserialize, Debug)] +use crate::svm::{Kernel, Kernels, LinearKernel}; +#[derive(Serialize, Deserialize, Debug, Clone)] /// SVR Parameters -pub struct SVRParameters { - /// Epsilon in the epsilon-SVR model +pub struct SVRParameters, K: Kernel> { + /// Epsilon in the epsilon-SVR model. pub eps: T, /// Regularization parameter. pub c: T, - /// Tolerance for stopping criterion + /// Tolerance for stopping criterion. pub tol: T, + /// The kernel function. + pub kernel: K, + /// Unused parameter. + m: PhantomData, } #[derive(Serialize, Deserialize, Debug)] @@ -135,16 +134,52 @@ struct Cache { data: Vec>>>, } -impl Default for SVRParameters { +impl, K: Kernel> SVRParameters { + /// Epsilon in the epsilon-SVR model. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// Regularization parameter. + pub fn with_c(mut self, c: T) -> Self { + self.c = c; + self + } + /// Tolerance for stopping criterion. + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The kernel function. + pub fn with_kernel>(&self, kernel: KK) -> SVRParameters { + SVRParameters { + eps: self.eps, + c: self.c, + tol: self.tol, + kernel: kernel, + m: PhantomData + } + } +} + +impl> Default for SVRParameters { fn default() -> Self { SVRParameters { eps: T::from_f64(0.1).unwrap(), c: T::one(), tol: T::from_f64(1e-3).unwrap(), + kernel: Kernels::linear(), + m: PhantomData } } } +impl, K: Kernel> Predictor for SVR { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl, K: Kernel> SVR { /// Fits SVR to your data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. @@ -153,9 +188,8 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, - y: &M::RowVector, - kernel: K, - parameters: SVRParameters, + y: &M::RowVector, + parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -165,12 +199,12 @@ impl, K: Kernel> SVR { )); } - let optimizer = Optimizer::new(x, y, &kernel, ¶meters); + let optimizer = Optimizer::new(x, y, ¶meters.kernel, ¶meters); let (support_vectors, weight, b) = optimizer.smo(); Ok(SVR { - kernel, + kernel: parameters.kernel, instances: support_vectors, w: weight, b, @@ -243,7 +277,7 @@ impl<'a, T: RealNumber, M: Matrix, K: Kernel> Optimizer<'a, x: &M, y: &M::RowVector, kernel: &'a K, - parameters: &SVRParameters, + parameters: &SVRParameters, ) -> Optimizer<'a, T, M, K> { let (n, _) = x.shape(); @@ -513,12 +547,7 @@ mod tests { let y_hat = SVR::fit( &x, &y, - LinearKernel {}, - SVRParameters { - eps: 2.0, - c: 10.0, - tol: 1e-3, - }, + SVRParameters::default().with_eps(2.0).with_c(10.0), ) .and_then(|lr| lr.predict(&x)) .unwrap(); @@ -552,7 +581,7 @@ mod tests { 114.2, 115.7, 116.9, ]; - let svr = SVR::fit(&x, &y, LinearKernel {}, Default::default()).unwrap(); + let svr = SVR::fit(&x, &y, Default::default()).unwrap(); let deserialized_svr: SVR, LinearKernel> = serde_json::from_str(&serde_json::to_string(&svr).unwrap()).unwrap(); diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 371bc4e..1845d5e 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -71,11 +71,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Decision Tree pub struct DecisionTreeClassifierParameters { /// Split criteria to use when building a tree. @@ -269,6 +270,12 @@ pub(in crate) fn which_max(x: &[usize]) -> usize { which } +impl> Predictor for DecisionTreeClassifier { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeClassifier { /// Build a decision tree classifier from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 5e80b4c..492f0a1 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -66,11 +66,12 @@ use rand::seq::SliceRandom; use serde::{Deserialize, Serialize}; use crate::algorithm::sort::quick_sort::QuickArgSort; +use crate::base::Predictor; use crate::error::Failed; use crate::linalg::Matrix; use crate::math::num::RealNumber; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] /// Parameters of Regression Tree pub struct DecisionTreeRegressorParameters { /// The maximum depth of the tree. @@ -189,6 +190,12 @@ impl<'a, T: RealNumber, M: Matrix> NodeVisitor<'a, T, M> { } } +impl> Predictor for DecisionTreeRegressor { + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + impl DecisionTreeRegressor { /// Build a decision tree regressor from the training data. /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. From 9b221979da51f9a26c693f5f5300599939416df6 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 16:35:28 -0800 Subject: [PATCH 2/7] fix: clippy, documentation and formatting --- src/linalg/mod.rs | 10 +++---- src/linear/logistic_regression.rs | 12 +++++---- src/model_selection/kfold.rs | 29 +++++---------------- src/model_selection/mod.rs | 43 ++++++++++++++++++++++++++----- src/naive_bayes/multinomial.rs | 2 +- src/svm/svc.rs | 20 +++++++++----- src/svm/svr.rs | 26 +++++++++---------- 7 files changed, 80 insertions(+), 62 deletions(-) diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index 5b49942..264815b 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -281,8 +281,8 @@ pub trait BaseVector: Clone + Debug { let mut result = Self::zeros(n); - for i in 0..n { - result.set(i, self.get(index[i])); + for (i, idx) in index.iter().enumerate() { + result.set(i, self.get(*idx)); } result @@ -639,11 +639,11 @@ pub trait BaseMatrix: Clone + Debug { _ => Self::zeros(n, index.len()), }; - for i in 0..index.len() { + for (i, idx) in index.iter().enumerate() { for j in 0..k { match axis { - 0 => result.set(i, j, self.get(index[i], j)), - _ => result.set(j, i, self.get(j, index[i])), + 0 => result.set(i, j, self.get(*idx, j)), + _ => result.set(j, i, self.get(j, *idx)), }; } } diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs index b85bbe8..ffb845c 100644 --- a/src/linear/logistic_regression.rs +++ b/src/linear/logistic_regression.rs @@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder; /// Logistic Regression parameters #[derive(Serialize, Deserialize, Debug, Clone)] -pub struct LogisticRegressionParameters { -} +pub struct LogisticRegressionParameters {} /// Logistic Regression #[derive(Serialize, Deserialize, Debug)] @@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix> { impl Default for LogisticRegressionParameters { fn default() -> Self { - LogisticRegressionParameters { - } + LogisticRegressionParameters {} } } @@ -231,7 +229,11 @@ impl> LogisticRegression { /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. /// * `y` - target class values /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. - pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result, Failed> { + pub fn fit( + x: &M, + y: &M::RowVector, + _parameters: LogisticRegressionParameters, + ) -> Result, Failed> { let y_m = M::from_row_vector(y.clone()); let (x_nrows, num_attributes) = x.shape(); let (_, y_nrows) = y_m.shape(); diff --git a/src/model_selection/kfold.rs b/src/model_selection/kfold.rs index 0fbe224..63827c4 100644 --- a/src/model_selection/kfold.rs +++ b/src/model_selection/kfold.rs @@ -1,30 +1,13 @@ //! # KFold //! -//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate), -//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data. -//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data. -//! Underfitted is bad because the model is undetrained and does not fit the training data well. -//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for -//! your data. -//! -//! In SmartCore you can split your data into training and test datasets using `train_test_split` function. +//! Defines k-fold cross validator. use crate::linalg::Matrix; use crate::math::num::RealNumber; +use crate::model_selection::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -/// An interface for the K-Folds cross-validator -pub trait BaseKFold { - /// An iterator over indices that split data into training and test set. - type Output: Iterator, Vec)>; - /// Return a tuple containing the the training set indices for that split and - /// the testing set indices for that split. - fn split>(&self, x: &M) -> Self::Output; - /// Returns the number of splits - fn n_splits(&self) -> usize; -} - /// K-Folds cross-validator pub struct KFold { /// Number of folds. Must be at least 2. @@ -101,12 +84,12 @@ impl KFold { } /// An iterator over indices that split data into training and test set. -pub struct BaseKFoldIter { +pub struct KFoldIter { indices: Vec, test_indices: Vec>, } -impl Iterator for BaseKFoldIter { +impl Iterator for KFoldIter { type Item = (Vec, Vec); fn next(&mut self) -> Option<(Vec, Vec)> { @@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter { /// Abstract class for all KFold functionalities impl BaseKFold for KFold { - type Output = BaseKFoldIter; + type Output = KFoldIter; fn n_splits(&self) -> usize { self.n_splits @@ -148,7 +131,7 @@ impl BaseKFold for KFold { let mut test_indices = self.test_masks(x); test_indices.reverse(); - BaseKFoldIter { + KFoldIter { indices, test_indices, } diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 64527b3..0aabb97 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -14,15 +14,27 @@ use crate::error::Failed; use crate::linalg::BaseVector; use crate::linalg::Matrix; use crate::math::num::RealNumber; -use crate::model_selection::kfold::BaseKFold; use rand::seq::SliceRandom; use rand::thread_rng; -pub mod kfold; +pub(crate) mod kfold; + +pub use kfold::{KFold, KFoldIter}; + +/// An interface for the K-Folds cross-validator +pub trait BaseKFold { + /// An iterator over indices that split data into training and test set. + type Output: Iterator, Vec)>; + /// Return a tuple containing the the training set indices for that split and + /// the testing set indices for that split. + fn split>(&self, x: &M) -> Self::Output; + /// Returns the number of splits + fn n_splits(&self) -> usize; +} /// Splits data into 2 disjoint datasets. /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. -/// * `y` - target values, should be of size _M_ +/// * `y` - target values, should be of size _N_ /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split. /// * `shuffle`, - whether or not to shuffle the data before splitting pub fn train_test_split>( @@ -65,22 +77,33 @@ pub fn train_test_split>( (x_train, x_test, y_train, y_test) } +/// Cross validation results. #[derive(Clone, Debug)] pub struct CrossValidationResult { + /// Vector with test scores on each cv split pub test_score: Vec, + /// Vector with training scores on each cv split pub train_score: Vec, } impl CrossValidationResult { + /// Average test score pub fn mean_test_score(&self) -> T { self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap() } - + /// Average training score pub fn mean_train_score(&self) -> T { self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap() } } +/// Evaluate an estimator by cross-validation using given metric. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. +/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html) pub fn cross_validate( fit_estimator: F, x: &M, @@ -302,7 +325,6 @@ mod tests { #[test] fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ &[5.1, 3.5, 1.4, 0.2], &[4.9, 3.0, 1.4, 0.2], @@ -334,8 +356,15 @@ mod tests { ..KFold::default() }; - let results = - cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap(); + let results = cross_validate( + DecisionTreeClassifier::fit, + &x, + &y, + Default::default(), + cv, + &accuracy, + ) + .unwrap(); println!("{}", results.mean_test_score()); println!("{}", results.mean_train_score()); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 84d3fd1..849b8db 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -188,7 +188,7 @@ pub struct MultinomialNB> { inner: BaseNaiveBayes>, } -impl> Predictor for MultinomialNB { +impl> Predictor for MultinomialNB { fn predict(&self, x: &M) -> Result { self.predict(x) } diff --git a/src/svm/svc.rs b/src/svm/svc.rs index cbe97f7..aee4d3f 100644 --- a/src/svm/svc.rs +++ b/src/svm/svc.rs @@ -167,8 +167,8 @@ impl, K: Kernel> SVCParameters> Default for SVCParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVC { +impl, K: Kernel> Predictor + for SVC +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -743,10 +745,12 @@ mod tests { let y_hat = SVC::fit( &x, &y, - SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()), + SVCParameters::default() + .with_c(200.0) + .with_kernel(Kernels::linear()), ) .and_then(|lr| lr.predict(&x)) - .unwrap(); + .unwrap(); assert!(accuracy(&y_hat, &y) >= 0.9); } @@ -784,7 +788,9 @@ mod tests { let y_hat = SVC::fit( &x, &y, - SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)), + SVCParameters::default() + .with_c(1.0) + .with_kernel(Kernels::rbf(0.7)), ) .and_then(|lr| lr.predict(&x)) .unwrap(); diff --git a/src/svm/svr.rs b/src/svm/svr.rs index 25c7ff6..295ad78 100644 --- a/src/svm/svr.rs +++ b/src/svm/svr.rs @@ -134,7 +134,7 @@ struct Cache { data: Vec>>>, } -impl, K: Kernel> SVRParameters { +impl, K: Kernel> SVRParameters { /// Epsilon in the epsilon-SVR model. pub fn with_eps(mut self, eps: T) -> Self { self.eps = eps; @@ -153,11 +153,11 @@ impl, K: Kernel> SVRParameters>(&self, kernel: KK) -> SVRParameters { SVRParameters { - eps: self.eps, + eps: self.eps, c: self.c, tol: self.tol, - kernel: kernel, - m: PhantomData + kernel, + m: PhantomData, } } } @@ -169,12 +169,14 @@ impl> Default for SVRParameters c: T::one(), tol: T::from_f64(1e-3).unwrap(), kernel: Kernels::linear(), - m: PhantomData + m: PhantomData, } } } -impl, K: Kernel> Predictor for SVR { +impl, K: Kernel> Predictor + for SVR +{ fn predict(&self, x: &M) -> Result { self.predict(x) } @@ -188,7 +190,7 @@ impl, K: Kernel> SVR { /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values. pub fn fit( x: &M, - y: &M::RowVector, + y: &M::RowVector, parameters: SVRParameters, ) -> Result, Failed> { let (n, _) = x.shape(); @@ -544,13 +546,9 @@ mod tests { 114.2, 115.7, 116.9, ]; - let y_hat = SVR::fit( - &x, - &y, - SVRParameters::default().with_eps(2.0).with_c(10.0), - ) - .and_then(|lr| lr.predict(&x)) - .unwrap(); + let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0)) + .and_then(|lr| lr.predict(&x)) + .unwrap(); assert!(mean_squared_error(&y_hat, &y) < 2.5); } From f685f575e068080b64d660ebe34261f3556ffee7 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:42:18 -0800 Subject: [PATCH 3/7] feat: + cross_val_predict --- src/model_selection/mod.rs | 105 +++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 0aabb97..7178da8 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -125,7 +125,7 @@ where let mut test_score = Vec::with_capacity(k); let mut train_score = Vec::with_capacity(k); - for (test_idx, train_idx) in cv.split(x) { + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); let test_x = x.take(&test_idx, 0); @@ -143,6 +143,46 @@ where }) } +/// Generate cross-validated estimates for each input data point. +/// The data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set. +/// * `fit_estimator` - a `fit` function of an estimator +/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes. +/// * `y` - target values, should be of size _N_ +/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters. +/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html) +pub fn cross_val_predict( + fit_estimator: F, + x: &M, + y: &M::RowVector, + parameters: H, + cv: K +) -> Result +where + T: RealNumber, + M: Matrix, + H: Clone, + E: Predictor, + K: BaseKFold, + F: Fn(&M, &M::RowVector, H) -> Result +{ + let mut y_hat = M::RowVector::zeros(y.len()); + + for (train_idx, test_idx) in cv.split(x) { + let train_x = x.take(&train_idx, 0); + let train_y = y.take(&train_idx); + let test_x = x.take(&test_idx, 0); + + let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; + + let y_test_hat = estimator.predict(&test_x)?; + for (i, &idx) in test_idx.iter().enumerate() { + y_hat.set(idx, y_test_hat.get(i)); + } + } + + Ok(y_hat) +} + #[cfg(test)] mod tests { @@ -278,10 +318,8 @@ mod tests { assert!(results.mean_train_score() < results.mean_test_score()); } - use crate::tree::decision_tree_regressor::*; - #[test] - fn test_some_regressor() { + fn test_cross_val_predict_knn() { let x = DenseMatrix::from_2d_array(&[ &[234.289, 235.6, 159., 107.608, 1947., 60.323], &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], @@ -305,68 +343,21 @@ mod tests { 114.2, 115.7, 116.9, ]; - let cv = KFold::default().with_n_splits(2); - - let results = cross_validate( - DecisionTreeRegressor::fit, - &x, - &y, - Default::default(), - cv, - &mean_absolute_error, - ) - .unwrap(); - - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); - } - - use crate::tree::decision_tree_classifier::*; - - #[test] - fn test_some_classifier() { - let x = DenseMatrix::from_2d_array(&[ - &[5.1, 3.5, 1.4, 0.2], - &[4.9, 3.0, 1.4, 0.2], - &[4.7, 3.2, 1.3, 0.2], - &[4.6, 3.1, 1.5, 0.2], - &[5.0, 3.6, 1.4, 0.2], - &[5.4, 3.9, 1.7, 0.4], - &[4.6, 3.4, 1.4, 0.3], - &[5.0, 3.4, 1.5, 0.2], - &[4.4, 2.9, 1.4, 0.2], - &[4.9, 3.1, 1.5, 0.1], - &[7.0, 3.2, 4.7, 1.4], - &[6.4, 3.2, 4.5, 1.5], - &[6.9, 3.1, 4.9, 1.5], - &[5.5, 2.3, 4.0, 1.3], - &[6.5, 2.8, 4.6, 1.5], - &[5.7, 2.8, 4.5, 1.3], - &[6.3, 3.3, 4.7, 1.6], - &[4.9, 2.4, 3.3, 1.0], - &[6.6, 2.9, 4.6, 1.3], - &[5.2, 2.7, 3.9, 1.4], - ]); - let y = vec![ - 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - ]; - let cv = KFold { n_splits: 2, ..KFold::default() }; - let results = cross_validate( - DecisionTreeClassifier::fit, + let y_hat = cross_val_predict( + KNNRegressor::fit, &x, &y, Default::default(), - cv, - &accuracy, + cv ) - .unwrap(); + .unwrap(); - println!("{}", results.mean_test_score()); - println!("{}", results.mean_train_score()); + assert!(mean_absolute_error(&y, &y_hat) < 10.0); } + } From 74f0d9e6fb574196cd84bc7d82169ad8a96cb910 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Tue, 22 Dec 2020 17:44:44 -0800 Subject: [PATCH 4/7] fix: formatting --- src/model_selection/mod.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/model_selection/mod.rs b/src/model_selection/mod.rs index 7178da8..7776354 100644 --- a/src/model_selection/mod.rs +++ b/src/model_selection/mod.rs @@ -155,7 +155,7 @@ pub fn cross_val_predict( x: &M, y: &M::RowVector, parameters: H, - cv: K + cv: K, ) -> Result where T: RealNumber, @@ -163,14 +163,14 @@ where H: Clone, E: Predictor, K: BaseKFold, - F: Fn(&M, &M::RowVector, H) -> Result -{ - let mut y_hat = M::RowVector::zeros(y.len()); - + F: Fn(&M, &M::RowVector, H) -> Result, +{ + let mut y_hat = M::RowVector::zeros(y.len()); + for (train_idx, test_idx) in cv.split(x) { let train_x = x.take(&train_idx, 0); let train_y = y.take(&train_idx); - let test_x = x.take(&test_idx, 0); + let test_x = x.take(&test_idx, 0); let estimator = fit_estimator(&train_x, &train_y, parameters.clone())?; @@ -348,16 +348,8 @@ mod tests { ..KFold::default() }; - let y_hat = cross_val_predict( - KNNRegressor::fit, - &x, - &y, - Default::default(), - cv - ) - .unwrap(); + let y_hat = cross_val_predict(KNNRegressor::fit, &x, &y, Default::default(), cv).unwrap(); assert!(mean_absolute_error(&y, &y_hat) < 10.0); } - } From dd341f4a12a8638f2f5538bc2fa68b5d2ca779de Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:29:39 -0800 Subject: [PATCH 5/7] feat: + builders for algorithm parameters --- src/cluster/dbscan.rs | 22 ++++++++++++++-- src/cluster/kmeans.rs | 8 ++++++ src/decomposition/pca.rs | 9 +++++++ src/ensemble/random_forest_classifier.rs | 33 ++++++++++++++++++++++++ src/ensemble/random_forest_regressor.rs | 28 ++++++++++++++++++++ src/linear/elastic_net.rs | 30 +++++++++++++++++++++ src/linear/lasso.rs | 23 +++++++++++++++++ src/linear/linear_regression.rs | 8 ++++++ src/linear/ridge_regression.rs | 18 +++++++++++++ src/naive_bayes/bernoulli.rs | 15 +++++++++++ src/naive_bayes/categorical.rs | 6 +++++ src/naive_bayes/gaussian.rs | 5 ++++ src/naive_bayes/multinomial.rs | 10 +++++++ src/neighbors/knn_classifier.rs | 14 +++++++--- src/neighbors/knn_regressor.rs | 14 +++++++--- src/tree/decision_tree_classifier.rs | 23 +++++++++++++++++ src/tree/decision_tree_regressor.rs | 18 +++++++++++++ 17 files changed, 276 insertions(+), 8 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index e595028..ac095f6 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -53,14 +53,32 @@ pub struct DBSCAN, T>> { #[derive(Debug, Clone)] /// DBSCAN clustering algorithm parameters pub struct DBSCANParameters { - /// Maximum number of iterations of the k-means algorithm for a single run. + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. pub min_samples: usize, - /// The number of samples in a neighborhood for a point to be considered as a core point. + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. pub eps: T, /// KNN algorithm to use. pub algorithm: KNNAlgorithmName, } +impl DBSCANParameters { + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + pub fn with_min_samples(mut self, min_samples: usize) -> Self { + self.min_samples = min_samples; + self + } + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// KNN algorithm to use. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } +} + impl, T>> PartialEq for DBSCAN { fn eq(&self, other: &Self) -> bool { self.cluster_labels.len() == other.cluster_labels.len() diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs index 26a4038..bc5d673 100644 --- a/src/cluster/kmeans.rs +++ b/src/cluster/kmeans.rs @@ -105,6 +105,14 @@ pub struct KMeansParameters { pub max_iter: usize, } +impl KMeansParameters { + /// Maximum number of iterations of the k-means algorithm for a single run. + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for KMeansParameters { fn default() -> Self { KMeansParameters { max_iter: 100 } diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 7d80f88..68220e3 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -88,6 +88,15 @@ pub struct PCAParameters { pub use_correlation_matrix: bool, } +impl PCAParameters { + /// By default, covariance matrix is used to compute principal components. + /// Enable this flag if you want to use correlation matrix instead. + pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self { + self.use_correlation_matrix = use_correlation_matrix; + self + } +} + impl Default for PCAParameters { fn default() -> Self { PCAParameters { diff --git a/src/ensemble/random_forest_classifier.rs b/src/ensemble/random_forest_classifier.rs index a742d90..9f1ba72 100644 --- a/src/ensemble/random_forest_classifier.rs +++ b/src/ensemble/random_forest_classifier.rs @@ -85,6 +85,39 @@ pub struct RandomForestClassifier { classes: Vec, } +impl RandomForestClassifierParameters { + /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: u16) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. + pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl PartialEq for RandomForestClassifier { fn eq(&self, other: &Self) -> bool { if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() { diff --git a/src/ensemble/random_forest_regressor.rs b/src/ensemble/random_forest_regressor.rs index 52b39f9..6aa89d0 100644 --- a/src/ensemble/random_forest_regressor.rs +++ b/src/ensemble/random_forest_regressor.rs @@ -80,6 +80,34 @@ pub struct RandomForestRegressor { trees: Vec>, } +impl RandomForestRegressorParameters { + /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html) + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } + /// The number of trees in the forest. + pub fn with_n_trees(mut self, n_trees: usize) -> Self { + self.n_trees = n_trees; + self + } + /// Number of random sample of predictors to use as split candidates. + pub fn with_m(mut self, m: usize) -> Self { + self.m = Some(m); + self + } +} + impl Default for RandomForestRegressorParameters { fn default() -> Self { RandomForestRegressorParameters { diff --git a/src/linear/elastic_net.rs b/src/linear/elastic_net.rs index b386290..1ab933a 100644 --- a/src/linear/elastic_net.rs +++ b/src/linear/elastic_net.rs @@ -90,6 +90,36 @@ pub struct ElasticNet> { intercept: T, } +impl ElasticNetParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1. + /// For l1_ratio = 0 the penalty is an L2 penalty. + /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + pub fn with_l1_ratio(mut self, l1_ratio: T) -> Self { + self.l1_ratio = l1_ratio; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. + pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for ElasticNetParameters { fn default() -> Self { ElasticNetParameters { diff --git a/src/linear/lasso.rs b/src/linear/lasso.rs index 0dab3e5..e16a316 100644 --- a/src/linear/lasso.rs +++ b/src/linear/lasso.rs @@ -54,6 +54,29 @@ pub struct Lasso> { intercept: T, } +impl LassoParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. + pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } + /// The tolerance for the optimization + pub fn with_tol(mut self, tol: T) -> Self { + self.tol = tol; + self + } + /// The maximum number of iterations + pub fn with_max_iter(mut self, max_iter: usize) -> Self { + self.max_iter = max_iter; + self + } +} + impl Default for LassoParameters { fn default() -> Self { LassoParameters { diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index c7bd872..0ebad34 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -93,6 +93,14 @@ pub struct LinearRegression> { solver: LinearRegressionSolverName, } +impl LinearRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: LinearRegressionSolverName) -> Self { + self.solver = solver; + self + } +} + impl Default for LinearRegressionParameters { fn default() -> Self { LinearRegressionParameters { diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 2b5a898..5c14313 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -98,6 +98,24 @@ pub struct RidgeRegression> { solver: RidgeRegressionSolverName, } +impl RidgeRegressionParameters { + /// Regularization parameter. + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Solver to use for estimation of regression coefficients. + pub fn with_solver(mut self, solver: RidgeRegressionSolverName) -> Self { + self.solver = solver; + self + } + /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation. + pub fn with_normalize(mut self, normalize: bool) -> Self { + self.normalize = normalize; + self + } +} + impl Default for RidgeRegressionParameters { fn default() -> Self { RidgeRegressionParameters { diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index fe299f3..db98efc 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -96,6 +96,21 @@ impl BernoulliNBParameters { binarize, } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } + /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. + pub fn with_binarize(mut self, binarize: T) -> Self { + self.binarize = Some(binarize); + self + } } impl Default for BernoulliNBParameters { diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ce526ce..ea81eb5 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -234,7 +234,13 @@ impl CategoricalNBParameters { ))) } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } } + impl Default for CategoricalNBParameters { fn default() -> Self { Self { alpha: T::one() } diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index 01dacd7..f1fc812 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -86,6 +86,11 @@ impl GaussianNBParameters { pub fn new(priors: Option>) -> Self { Self { priors } } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl GaussianNBDistribution { diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 849b8db..50d2ee2 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -86,6 +86,16 @@ impl MultinomialNBParameters { pub fn new(alpha: T, priors: Option>) -> Self { Self { alpha, priors } } + /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + pub fn with_alpha(mut self, alpha: T) -> Self { + self.alpha = alpha; + self + } + /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data + pub fn with_priors(mut self, priors: Vec) -> Self { + self.priors = Some(priors); + self + } } impl Default for MultinomialNBParameters { diff --git a/src/neighbors/knn_classifier.rs b/src/neighbors/knn_classifier.rs index 8b4db1b..6668539 100644 --- a/src/neighbors/knn_classifier.rs +++ b/src/neighbors/knn_classifier.rs @@ -80,9 +80,17 @@ impl, T>> KNNClassifierParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNClassifierParameters { + KNNClassifierParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/neighbors/knn_regressor.rs b/src/neighbors/knn_regressor.rs index a97fdea..80971e5 100644 --- a/src/neighbors/knn_regressor.rs +++ b/src/neighbors/knn_regressor.rs @@ -82,9 +82,17 @@ impl, T>> KNNRegressorParameters { /// a function that defines a distance between each pair of point in training data. /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. - pub fn with_distance(mut self, distance: D) -> Self { - self.distance = distance; - self + pub fn with_distance, T>>( + self, + distance: DD, + ) -> KNNRegressorParameters { + KNNRegressorParameters { + distance, + algorithm: self.algorithm, + weight: self.weight, + k: self.k, + t: PhantomData, + } } /// backend search algorithm. See [`knn search algorithms`](../../algorithm/neighbour/index.html). `CoverTree` is default. pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { diff --git a/src/tree/decision_tree_classifier.rs b/src/tree/decision_tree_classifier.rs index 1845d5e..50a855b 100644 --- a/src/tree/decision_tree_classifier.rs +++ b/src/tree/decision_tree_classifier.rs @@ -161,6 +161,29 @@ impl PartialEq for Node { } } +impl DecisionTreeClassifierParameters { + /// Split criteria to use when building a tree. + pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self { + self.criterion = criterion; + self + } + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeClassifierParameters { fn default() -> Self { DecisionTreeClassifierParameters { diff --git a/src/tree/decision_tree_regressor.rs b/src/tree/decision_tree_regressor.rs index 492f0a1..806e680 100644 --- a/src/tree/decision_tree_regressor.rs +++ b/src/tree/decision_tree_regressor.rs @@ -101,6 +101,24 @@ struct Node { false_child: Option, } +impl DecisionTreeRegressorParameters { + /// The maximum depth of the tree. + pub fn with_max_depth(mut self, max_depth: u16) -> Self { + self.max_depth = Some(max_depth); + self + } + /// The minimum number of samples required to be at a leaf node. + pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self { + self.min_samples_leaf = min_samples_leaf; + self + } + /// The minimum number of samples required to split an internal node. + pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self { + self.min_samples_split = min_samples_split; + self + } +} + impl Default for DecisionTreeRegressorParameters { fn default() -> Self { DecisionTreeRegressorParameters { From 32ae63a577b3a84bcca2dc7472f830b00290f085 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Wed, 23 Dec 2020 12:38:10 -0800 Subject: [PATCH 6/7] feat: documentation adjusted to new builder --- src/cluster/dbscan.rs | 8 +++----- src/linear/linear_regression.rs | 6 +++--- src/linear/ridge_regression.rs | 7 ++----- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/cluster/dbscan.rs b/src/cluster/dbscan.rs index ac095f6..c572ccc 100644 --- a/src/cluster/dbscan.rs +++ b/src/cluster/dbscan.rs @@ -15,11 +15,9 @@ //! let blobs = generator::make_blobs(100, 2, 3); //! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); //! // Fit the algorithm and predict cluster labels -//! let labels = DBSCAN::fit(&x, Distances::euclidian(), DBSCANParameters{ -//! min_samples: 5, -//! eps: 3.0, -//! algorithm: KNNAlgorithmName::CoverTree -//! }).and_then(|dbscan| dbscan.predict(&x)); +//! let labels = DBSCAN::fit(&x, Distances::euclidian(), +//! DBSCANParameters::default().with_eps(3.0)). +//! and_then(|dbscan| dbscan.predict(&x)); //! //! println!("{:?}", labels); //! ``` diff --git a/src/linear/linear_regression.rs b/src/linear/linear_regression.rs index 0ebad34..1855673 100644 --- a/src/linear/linear_regression.rs +++ b/src/linear/linear_regression.rs @@ -45,9 +45,9 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let lr = LinearRegression::fit(&x, &y, LinearRegressionParameters { -//! solver: LinearRegressionSolverName::QR, // or SVD -//! }).unwrap(); +//! let lr = LinearRegression::fit(&x, &y, +//! LinearRegressionParameters::default(). +//! with_solver(LinearRegressionSolverName::QR)).unwrap(); //! //! let y_hat = lr.predict(&x).unwrap(); //! ``` diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs index 5c14313..f29898d 100644 --- a/src/linear/ridge_regression.rs +++ b/src/linear/ridge_regression.rs @@ -45,11 +45,8 @@ //! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, //! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; //! -//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters { -//! solver: RidgeRegressionSolverName::Cholesky, -//! alpha: 0.1, -//! normalize: true -//! }).and_then(|lr| lr.predict(&x)).unwrap(); +//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters::default().with_alpha(0.1)). +//! and_then(|lr| lr.predict(&x)).unwrap(); //! ``` //! //! ## References: From d22be7d6ae44c1fddc412fde9ca434070ae890b5 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Thu, 24 Dec 2020 13:47:09 -0800 Subject: [PATCH 7/7] fix: post-review changes --- src/naive_bayes/bernoulli.rs | 8 -------- src/naive_bayes/categorical.rs | 11 ----------- src/naive_bayes/gaussian.rs | 6 +----- src/naive_bayes/multinomial.rs | 4 ---- 4 files changed, 1 insertion(+), 28 deletions(-) diff --git a/src/naive_bayes/bernoulli.rs b/src/naive_bayes/bernoulli.rs index db98efc..c6cbfa8 100644 --- a/src/naive_bayes/bernoulli.rs +++ b/src/naive_bayes/bernoulli.rs @@ -88,14 +88,6 @@ pub struct BernoulliNBParameters { } impl BernoulliNBParameters { - /// Create BernoulliNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>, binarize: Option) -> Self { - Self { - alpha, - priors, - binarize, - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/categorical.rs b/src/naive_bayes/categorical.rs index ea81eb5..667a270 100644 --- a/src/naive_bayes/categorical.rs +++ b/src/naive_bayes/categorical.rs @@ -223,17 +223,6 @@ pub struct CategoricalNBParameters { } impl CategoricalNBParameters { - /// Create CategoricalNBParameters with specific paramaters. - pub fn new(alpha: T) -> Result { - if alpha > T::zero() { - Ok(Self { alpha }) - } else { - Err(Failed::fit(&format!( - "alpha should be >= 0, alpha=[{}]", - alpha - ))) - } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha; diff --git a/src/naive_bayes/gaussian.rs b/src/naive_bayes/gaussian.rs index f1fc812..bc96420 100644 --- a/src/naive_bayes/gaussian.rs +++ b/src/naive_bayes/gaussian.rs @@ -82,10 +82,6 @@ pub struct GaussianNBParameters { } impl GaussianNBParameters { - /// Create GaussianNBParameters with specific paramaters. - pub fn new(priors: Option>) -> Self { - Self { priors } - } /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data pub fn with_priors(mut self, priors: Vec) -> Self { self.priors = Some(priors); @@ -266,7 +262,7 @@ mod tests { let y = vec![1., 1., 1., 2., 2., 2.]; let priors = vec![0.3, 0.7]; - let parameters = GaussianNBParameters::new(Some(priors.clone())); + let parameters = GaussianNBParameters::default().with_priors(priors.clone()); let gnb = GaussianNB::fit(&x, &y, parameters).unwrap(); assert_eq!(gnb.inner.distribution.class_priors, priors); diff --git a/src/naive_bayes/multinomial.rs b/src/naive_bayes/multinomial.rs index 50d2ee2..237b606 100644 --- a/src/naive_bayes/multinomial.rs +++ b/src/naive_bayes/multinomial.rs @@ -82,10 +82,6 @@ pub struct MultinomialNBParameters { } impl MultinomialNBParameters { - /// Create MultinomialNBParameters with specific paramaters. - pub fn new(alpha: T, priors: Option>) -> Self { - Self { alpha, priors } - } /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). pub fn with_alpha(mut self, alpha: T) -> Self { self.alpha = alpha;