From ab7f46603c576f7c96feea8a9db09db79d996a54 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Fri, 6 Nov 2020 10:48:00 -0800 Subject: [PATCH] feat: + ridge regression --- src/linalg/mod.rs | 48 +++++ src/linalg/naive/dense_matrix.rs | 3 + src/linalg/nalgebra_bindings.rs | 6 + src/linalg/ndarray_bindings.rs | 6 + src/linalg/stats.rs | 139 +++++++++++++ src/linear/mod.rs | 1 + src/linear/ridge_regression.rs | 323 +++++++++++++++++++++++++++++++ 7 files changed, 526 insertions(+) create mode 100644 src/linalg/stats.rs create mode 100644 src/linear/ridge_regression.rs diff --git a/src/linalg/mod.rs b/src/linalg/mod.rs index fb12909..42ed558 100644 --- a/src/linalg/mod.rs +++ b/src/linalg/mod.rs @@ -48,6 +48,7 @@ pub mod nalgebra_bindings; pub mod ndarray_bindings; /// QR factorization that factors a matrix into a product of an orthogonal matrix and an upper triangular matrix. pub mod qr; +pub mod stats; /// Singular value decomposition. pub mod svd; @@ -60,6 +61,7 @@ use cholesky::CholeskyDecomposableMatrix; use evd::EVDDecomposableMatrix; use lu::LUDecomposableMatrix; use qr::QRDecomposableMatrix; +use stats::MatrixStats; use svd::SVDDecomposableMatrix; /// Column or row vector @@ -163,6 +165,32 @@ pub trait BaseVector: Clone + Debug { ///assert_eq!(a.unique(), vec![-7., -6., -2., 1., 2., 3., 4.]); /// ``` fn unique(&self) -> Vec; + + /// Compute the arithmetic mean. + fn mean(&self) -> T { + let n = self.len(); + let mut mean = T::zero(); + + for i in 0..n { + mean += self.get(i); + } + mean / T::from_usize(n).unwrap() + } + /// Compute the standard deviation. + fn std(&self) -> T { + let n = self.len(); + + let mut mu = T::zero(); + let mut sum = T::zero(); + let div = T::from_usize(n).unwrap(); + for i in 0..n { + let xi = self.get(i); + mu += xi; + sum += xi * xi; + } + mu /= div; + (sum / div - mu * mu).sqrt() + } } /// Generic matrix type. @@ -510,6 +538,7 @@ pub trait Matrix: + QRDecomposableMatrix + LUDecomposableMatrix + CholeskyDecomposableMatrix + + MatrixStats + PartialEq + Display { @@ -545,3 +574,22 @@ impl<'a, T: RealNumber, M: BaseMatrix> Iterator for RowIter<'a, T, M> { res } } + +#[cfg(test)] +mod tests { + use crate::linalg::BaseVector; + + #[test] + fn mean() { + let m = vec![1., 2., 3.]; + + assert_eq!(m.mean(), 2.0); + } + + #[test] + fn std() { + let m = vec![1., 2., 3.]; + + assert!((m.std() - 0.81f64).abs() < 1e-2); + } +} diff --git a/src/linalg/naive/dense_matrix.rs b/src/linalg/naive/dense_matrix.rs index d3d6353..e34dd91 100644 --- a/src/linalg/naive/dense_matrix.rs +++ b/src/linalg/naive/dense_matrix.rs @@ -12,6 +12,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; pub use crate::linalg::{BaseMatrix, BaseVector}; @@ -445,6 +446,8 @@ impl LUDecomposableMatrix for DenseMatrix {} impl CholeskyDecomposableMatrix for DenseMatrix {} +impl MatrixStats for DenseMatrix {} + impl Matrix for DenseMatrix {} impl PartialEq for DenseMatrix { diff --git a/src/linalg/nalgebra_bindings.rs b/src/linalg/nalgebra_bindings.rs index e0b885b..ad39057 100644 --- a/src/linalg/nalgebra_bindings.rs +++ b/src/linalg/nalgebra_bindings.rs @@ -46,6 +46,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix as SmartCoreMatrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -550,6 +551,11 @@ impl + MatrixStats for Matrix> +{ +} + impl SmartCoreMatrix for Matrix> { diff --git a/src/linalg/ndarray_bindings.rs b/src/linalg/ndarray_bindings.rs index 00c9745..e8de983 100644 --- a/src/linalg/ndarray_bindings.rs +++ b/src/linalg/ndarray_bindings.rs @@ -53,6 +53,7 @@ use crate::linalg::cholesky::CholeskyDecomposableMatrix; use crate::linalg::evd::EVDDecomposableMatrix; use crate::linalg::lu::LUDecomposableMatrix; use crate::linalg::qr::QRDecomposableMatrix; +use crate::linalg::stats::MatrixStats; use crate::linalg::svd::SVDDecomposableMatrix; use crate::linalg::Matrix; use crate::linalg::{BaseMatrix, BaseVector}; @@ -500,6 +501,11 @@ impl + MatrixStats for ArrayBase, Ix2> +{ +} + impl Matrix for ArrayBase, Ix2> { diff --git a/src/linalg/stats.rs b/src/linalg/stats.rs new file mode 100644 index 0000000..f5db1e9 --- /dev/null +++ b/src/linalg/stats.rs @@ -0,0 +1,139 @@ +//! # Various Statistical Methods +//! +//! + +use crate::linalg::BaseMatrix; +use crate::math::num::RealNumber; + +/// Defines baseline implementations for various statistical functions +pub trait MatrixStats: BaseMatrix { + /// Compute the arithmetic mean along the specified axis. + fn mean(&self, axis: u8) -> Vec { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + let mut x: Vec = vec![T::zero(); n]; + + let div = T::from_usize(m).unwrap(); + + for i in 0..n { + for j in 0..m { + x[i] += match axis { + 0 => self.get(j, i), + _ => self.get(i, j), + }; + } + x[i] /= div; + } + + x + } + + /// Compute the standard deviation along the specified axis. + fn std(&self, axis: u8) -> Vec { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + let mut x: Vec = vec![T::zero(); n]; + + let div = T::from_usize(m).unwrap(); + + for i in 0..n { + let mut mu = T::zero(); + let mut sum = T::zero(); + for j in 0..m { + let a = match axis { + 0 => self.get(j, i), + _ => self.get(i, j), + }; + mu += a; + sum += a * a; + } + mu /= div; + x[i] = (sum / div - mu * mu).sqrt(); + } + + x + } + + /// standardize values by removing the mean and scaling to unit variance + fn scale_mut(&mut self, mean: &Vec, std: &Vec, axis: u8) { + let (n, m) = match axis { + 0 => { + let (n, m) = self.shape(); + (m, n) + } + _ => self.shape(), + }; + + for i in 0..n { + for j in 0..m { + match axis { + 0 => self.set(j, i, (self.get(j, i) - mean[i]) / std[i]), + _ => self.set(i, j, (self.get(i, j) - mean[i]) / std[i]), + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + use crate::linalg::BaseVector; + + #[test] + fn mean() { + let m = DenseMatrix::from_2d_array(&[ + &[1., 2., 3., 1., 2.], + &[4., 5., 6., 3., 4.], + &[7., 8., 9., 5., 6.], + ]); + let expected_0 = vec![4., 5., 6., 3., 4.]; + let expected_1 = vec![1.8, 4.4, 7.]; + + assert_eq!(m.mean(0), expected_0); + assert_eq!(m.mean(1), expected_1); + } + + #[test] + fn std() { + let m = DenseMatrix::from_2d_array(&[ + &[1., 2., 3., 1., 2.], + &[4., 5., 6., 3., 4.], + &[7., 8., 9., 5., 6.], + ]); + let expected_0 = vec![2.44, 2.44, 2.44, 1.63, 1.63]; + let expected_1 = vec![0.74, 1.01, 1.41]; + + assert!(m.std(0).approximate_eq(&expected_0, 1e-2)); + assert!(m.std(1).approximate_eq(&expected_1, 1e-2)); + } + + #[test] + fn scale() { + let mut m = DenseMatrix::from_2d_array(&[&[1., 2., 3.], &[4., 5., 6.]]); + let expected_0 = DenseMatrix::from_2d_array(&[&[-1., -1., -1.], &[1., 1., 1.]]); + let expected_1 = DenseMatrix::from_2d_array(&[&[-1.22, 0.0, 1.22], &[-1.22, 0.0, 1.22]]); + + { + let mut m = m.clone(); + m.scale_mut(&m.mean(0), &m.std(0), 0); + assert!(m.approximate_eq(&expected_0, std::f32::EPSILON)); + } + + m.scale_mut(&m.mean(1), &m.std(1), 1); + assert!(m.approximate_eq(&expected_1, 1e-2)); + } +} diff --git a/src/linear/mod.rs b/src/linear/mod.rs index 54bbca0..fef7070 100644 --- a/src/linear/mod.rs +++ b/src/linear/mod.rs @@ -22,3 +22,4 @@ pub mod linear_regression; pub mod logistic_regression; +pub mod ridge_regression; diff --git a/src/linear/ridge_regression.rs b/src/linear/ridge_regression.rs new file mode 100644 index 0000000..18df6cb --- /dev/null +++ b/src/linear/ridge_regression.rs @@ -0,0 +1,323 @@ +//! # Ridge Regression +//! +//! [Linear regression](../linear_regression/index.html) is the standard algorithm for predicting a quantitative response \\(y\\) on the basis of a linear combination of explanatory variables \\(X\\) +//! that assumes that there is approximately a linear relationship between \\(X\\) and \\(y\\). +//! Ridge regression is an extension to linear regression that adds L2 regularization term to the loss function during training. +//! This term encourages simpler models that have smaller coefficient values. +//! +//! In ridge regression coefficients \\(\beta_0, \beta_0, ... \beta_n\\) are are estimated by solving +//! +//! \\[\hat{\beta} = (X^TX + \alpha I)^{-1}X^Ty \\] +//! +//! where \\(\alpha \geq 0\\) is a tuning parameter that controls strength of regularization. When \\(\alpha = 0\\) the penalty term has no effect, and ridge regression will produce the least squares estimates. +//! However, as \\(\alpha \rightarrow \infty\\), the impact of the shrinkage penalty grows, and the ridge regression coefficient estimates will approach zero. +//! +//! SmartCore uses [SVD](../../linalg/svd/index.html) and [Cholesky](../../linalg/cholesky/index.html) matrix decomposition to find estimates of \\(\hat{\beta}\\). +//! The Cholesky decomposition is more computationally efficient and more numerically stable than calculating the normal equation directly, +//! but does not work for all data matrices. Unlike the Cholesky decomposition, all matrices have an SVD decomposition. +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::linear::ridge_regression::*; +//! +//! // Longley dataset (https://www.statsmodels.org/stable/datasets/generated/longley.html) +//! let x = DenseMatrix::from_2d_array(&[ +//! &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], +//! &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], +//! &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], +//! &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], +//! &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], +//! &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], +//! &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], +//! &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], +//! &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], +//! &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], +//! &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], +//! &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], +//! &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], +//! &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], +//! &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], +//! &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], +//! ]); +//! +//! let y: Vec = vec![83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, +//! 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, 114.2, 115.7, 116.9]; +//! +//! let y_hat = RidgeRegression::fit(&x, &y, RidgeRegressionParameters { +//! solver: RidgeRegressionSolverName::Cholesky, +//! alpha: 0.1, +//! normalize: true +//! }).and_then(|lr| lr.predict(&x)).unwrap(); +//! ``` +//! +//! ## References: +//! +//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.2. Shrinkage Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/) +//! * ["Numerical Recipes: The Art of Scientific Computing", Press W.H., Teukolsky S.A., Vetterling W.T, Flannery B.P, 3rd ed., Section 15.4 General Linear Least Squares](http://numerical.recipes/) +//! +//! +//! +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::BaseVector; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +#[derive(Serialize, Deserialize, Debug)] +/// Approach to use for estimation of regression coefficients. Cholesky is more efficient but SVD is more stable. +pub enum RidgeRegressionSolverName { + /// Cholesky decomposition, see [Cholesky](../../linalg/cholesky/index.html) + Cholesky, + /// SVD decomposition, see [SVD](../../linalg/svd/index.html) + SVD, +} + +/// Ridge Regression parameters +#[derive(Serialize, Deserialize, Debug)] +pub struct RidgeRegressionParameters { + /// Solver to use for estimation of regression coefficients. + pub solver: RidgeRegressionSolverName, + /// Controls the strength of the penalty to the loss function. + pub alpha: T, + /// If true the regressors X will be normalized before regression + /// by subtracting the mean and dividing by the standard deviation. + pub normalize: bool, +} + +/// Ridge regression +#[derive(Serialize, Deserialize, Debug)] +pub struct RidgeRegression> { + coefficients: M, + intercept: T, + solver: RidgeRegressionSolverName, +} + +impl Default for RidgeRegressionParameters { + fn default() -> Self { + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::Cholesky, + alpha: T::one(), + normalize: true, + } + } +} + +impl> PartialEq for RidgeRegression { + fn eq(&self, other: &Self) -> bool { + self.coefficients == other.coefficients + && (self.intercept - other.intercept).abs() <= T::epsilon() + } +} + +impl> RidgeRegression { + /// Fits ridge regression to your data. + /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `y` - target values + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. + pub fn fit( + x: &M, + y: &M::RowVector, + parameters: RidgeRegressionParameters, + ) -> Result, Failed> { + //w = inv(X^t X + alpha*Id) * X.T y + + let (n, p) = x.shape(); + + if n <= p { + return Err(Failed::fit(&format!( + "Number of rows in X should be >= number of columns in X" + ))); + } + + let y_column = M::from_row_vector(y.clone()).transpose(); + + let (w, b) = if parameters.normalize { + let (scaled_x, col_mean, col_std) = Self::rescale_x(x)?; + let x_t = scaled_x.transpose(); + let x_t_y = x_t.matmul(&y_column); + let mut x_t_x = x_t.matmul(&scaled_x); + + for i in 0..p { + x_t_x.add_element_mut(i, i, parameters.alpha); + } + + let mut w = match parameters.solver { + RidgeRegressionSolverName::Cholesky => x_t_x.cholesky_solve_mut(x_t_y)?, + RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, + }; + + for i in 0..p { + w.set(i, 0, w.get(i, 0) / col_std[i]); + } + + let mut b = T::zero(); + + for i in 0..p { + b += w.get(i, 0) * col_mean[i]; + } + + let b = y.mean() - b; + + (w, b) + } else { + let x_t = x.transpose(); + let x_t_y = x_t.matmul(&y_column); + let mut x_t_x = x_t.matmul(x); + + for i in 0..p { + x_t_x.add_element_mut(i, i, parameters.alpha); + } + + let w = match parameters.solver { + RidgeRegressionSolverName::Cholesky => x_t_x.cholesky_solve_mut(x_t_y)?, + RidgeRegressionSolverName::SVD => x_t_x.svd_solve_mut(x_t_y)?, + }; + + (w, T::zero()) + }; + + Ok(RidgeRegression { + intercept: b, + coefficients: w, + solver: parameters.solver, + }) + } + + fn rescale_x(x: &M) -> Result<(M, Vec, Vec), Failed> { + let col_mean = x.mean(0); + let col_std = x.std(0); + + for i in 0..col_std.len() { + if (col_std[i] - T::zero()).abs() < T::epsilon() { + return Err(Failed::fit(&format!( + "Cannot rescale constant column {}", + i + ))); + } + } + + let mut scaled_x = x.clone(); + scaled_x.scale_mut(&col_mean, &col_std, 0); + Ok((scaled_x, col_mean, col_std)) + } + + /// Predict target values from `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn predict(&self, x: &M) -> Result { + let (nrows, _) = x.shape(); + let mut y_hat = x.matmul(&self.coefficients); + y_hat.add_mut(&M::fill(nrows, 1, self.intercept)); + Ok(y_hat.transpose().to_row_vector()) + } + + /// Get estimates regression coefficients + pub fn coefficients(&self) -> M { + self.coefficients.clone() + } + + /// Get estimate of intercept + pub fn intercept(&self) -> T { + self.intercept + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + use crate::metrics::mean_absolute_error; + + #[test] + fn ridge_fit_predict() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y: Vec = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let y_hat_cholesky = RidgeRegression::fit( + &x, + &y, + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::Cholesky, + alpha: 0.1, + normalize: true, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat_cholesky, &y) < 2.0); + + let y_hat_svd = RidgeRegression::fit( + &x, + &y, + RidgeRegressionParameters { + solver: RidgeRegressionSolverName::SVD, + alpha: 0.1, + normalize: false, + }, + ) + .and_then(|lr| lr.predict(&x)) + .unwrap(); + + assert!(mean_absolute_error(&y_hat_svd, &y) < 2.0); + } + + #[test] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + &[258.054, 368.2, 161.6, 109.773, 1949., 60.171], + &[284.599, 335.1, 165.0, 110.929, 1950., 61.187], + &[328.975, 209.9, 309.9, 112.075, 1951., 63.221], + &[346.999, 193.2, 359.4, 113.270, 1952., 63.639], + &[365.385, 187.0, 354.7, 115.094, 1953., 64.989], + &[363.112, 357.8, 335.0, 116.219, 1954., 63.761], + &[397.469, 290.4, 304.8, 117.388, 1955., 66.019], + &[419.180, 282.2, 285.7, 118.734, 1956., 67.857], + &[442.769, 293.6, 279.8, 120.445, 1957., 68.169], + &[444.546, 468.1, 263.7, 121.950, 1958., 66.513], + &[482.704, 381.3, 255.2, 123.366, 1959., 68.655], + &[502.601, 393.1, 251.4, 125.368, 1960., 69.564], + &[518.173, 480.6, 257.2, 127.852, 1961., 69.331], + &[554.894, 400.7, 282.7, 130.081, 1962., 70.551], + ]); + + let y = vec![ + 83.0, 88.5, 88.2, 89.5, 96.2, 98.1, 99.0, 100.0, 101.2, 104.6, 108.4, 110.8, 112.6, + 114.2, 115.7, 116.9, + ]; + + let lr = RidgeRegression::fit(&x, &y, Default::default()).unwrap(); + + let deserialized_lr: RidgeRegression> = + serde_json::from_str(&serde_json::to_string(&lr).unwrap()).unwrap(); + + assert_eq!(lr, deserialized_lr); + } +}