From 74a7c45c75313045c0547ef5271e604c5995fb89 Mon Sep 17 00:00:00 2001 From: Volodymyr Orlov Date: Mon, 14 Dec 2020 14:59:02 -0800 Subject: [PATCH] feat: adds SVD --- src/decomposition/mod.rs | 1 + src/decomposition/pca.rs | 28 +++++ src/decomposition/svd.rs | 235 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 src/decomposition/svd.rs diff --git a/src/decomposition/mod.rs b/src/decomposition/mod.rs index a01c114..1460bd6 100644 --- a/src/decomposition/mod.rs +++ b/src/decomposition/mod.rs @@ -13,3 +13,4 @@ /// PCA is a popular approach for deriving a low-dimensional set of features from a large set of variables. pub mod pca; +pub mod svd; diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs index 9f5bd39..7d80f88 100644 --- a/src/decomposition/pca.rs +++ b/src/decomposition/pca.rs @@ -108,6 +108,13 @@ impl> PCA { ) -> Result, Failed> { let (m, n) = data.shape(); + if n_components > n { + return Err(Failed::fit(&format!( + "Number of components, n_components should be <= number of attributes ({})", + n + ))); + } + let mu = data.column_mean(); let mut x = data.clone(); @@ -224,6 +231,11 @@ impl> PCA { } Ok(x_transformed) } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.projection + } } #[cfg(test)] @@ -286,6 +298,22 @@ mod tests { ]) } + #[test] + fn pca_components() { + let us_arrests = us_arrests_data(); + + let expected = DenseMatrix::from_2d_array(&[ + &[0.0417, 0.0448], + &[0.9952, 0.0588], + &[0.0463, 0.9769], + &[0.0752, 0.2007], + ]); + + let pca = PCA::fit(&us_arrests, 2, Default::default()).unwrap(); + + assert!(expected.approximate_eq(&pca.components().abs(), 0.4)); + } + #[test] fn decompose_covariance() { let us_arrests = us_arrests_data(); diff --git a/src/decomposition/svd.rs b/src/decomposition/svd.rs new file mode 100644 index 0000000..fbaf042 --- /dev/null +++ b/src/decomposition/svd.rs @@ -0,0 +1,235 @@ +//! # Dimensionality reduction using SVD +//! +//! Similar to [`PCA`](../pca/index.html), SVD is a technique that can be used to reduce the number of input variables _p_ to a smaller number _k_, while preserving +//! the most important structure or relationships between the variables observed in the data. +//! +//! Contrary to PCA, SVD does not center the data before computing the singular value decomposition. +//! +//! Example: +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::decomposition::svd::*; +//! +//! // Iris data +//! let iris = DenseMatrix::from_2d_array(&[ +//! &[5.1, 3.5, 1.4, 0.2], +//! &[4.9, 3.0, 1.4, 0.2], +//! &[4.7, 3.2, 1.3, 0.2], +//! &[4.6, 3.1, 1.5, 0.2], +//! &[5.0, 3.6, 1.4, 0.2], +//! &[5.4, 3.9, 1.7, 0.4], +//! &[4.6, 3.4, 1.4, 0.3], +//! &[5.0, 3.4, 1.5, 0.2], +//! &[4.4, 2.9, 1.4, 0.2], +//! &[4.9, 3.1, 1.5, 0.1], +//! &[7.0, 3.2, 4.7, 1.4], +//! &[6.4, 3.2, 4.5, 1.5], +//! &[6.9, 3.1, 4.9, 1.5], +//! &[5.5, 2.3, 4.0, 1.3], +//! &[6.5, 2.8, 4.6, 1.5], +//! &[5.7, 2.8, 4.5, 1.3], +//! &[6.3, 3.3, 4.7, 1.6], +//! &[4.9, 2.4, 3.3, 1.0], +//! &[6.6, 2.9, 4.6, 1.3], +//! &[5.2, 2.7, 3.9, 1.4], +//! ]); +//! +//! let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); // Reduce number of features to 2 +//! +//! let iris_reduced = svd.transform(&iris).unwrap(); +//! +//! ``` +//! +//! +//! +use std::fmt::Debug; +use std::marker::PhantomData; + +use serde::{Deserialize, Serialize}; + +use crate::error::Failed; +use crate::linalg::Matrix; +use crate::math::num::RealNumber; + +/// SVD +#[derive(Serialize, Deserialize, Debug)] +pub struct SVD> { + components: M, + phantom: PhantomData, +} + +impl> PartialEq for SVD { + fn eq(&self, other: &Self) -> bool { + self.components + .approximate_eq(&other.components, T::from_f64(1e-8).unwrap()) + } +} + +#[derive(Debug, Clone)] +/// SVD parameters +pub struct SVDParameters {} + +impl Default for SVDParameters { + fn default() -> Self { + SVDParameters {} + } +} + +impl> SVD { + /// Fits SVD to your data. + /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation. + /// * `n_components` - number of components to keep. + /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values. + pub fn fit(x: &M, n_components: usize, _: SVDParameters) -> Result, Failed> { + let (_, p) = x.shape(); + + if n_components >= p { + return Err(Failed::fit(&format!( + "Number of components, n_components should be < number of attributes ({})", + p + ))); + } + + let svd = x.svd()?; + + let components = svd.V.slice(0..p, 0..n_components); + + Ok(SVD { + components, + phantom: PhantomData, + }) + } + + /// Run dimensionality reduction for `x` + /// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features. + pub fn transform(&self, x: &M) -> Result { + let (n, p) = x.shape(); + let (p_c, k) = self.components.shape(); + if p_c != p { + return Err(Failed::transform(&format!( + "Can not transform a {}x{} matrix into {}x{} matrix, incorrect input dimentions", + n, p, n, k + ))); + } + + Ok(x.matmul(&self.components)) + } + + /// Get a projection matrix + pub fn components(&self) -> &M { + &self.components + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::*; + + #[test] + fn svd_decompose() { + // https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/USArrests.html + let x = DenseMatrix::from_2d_array(&[ + &[13.2, 236.0, 58.0, 21.2], + &[10.0, 263.0, 48.0, 44.5], + &[8.1, 294.0, 80.0, 31.0], + &[8.8, 190.0, 50.0, 19.5], + &[9.0, 276.0, 91.0, 40.6], + &[7.9, 204.0, 78.0, 38.7], + &[3.3, 110.0, 77.0, 11.1], + &[5.9, 238.0, 72.0, 15.8], + &[15.4, 335.0, 80.0, 31.9], + &[17.4, 211.0, 60.0, 25.8], + &[5.3, 46.0, 83.0, 20.2], + &[2.6, 120.0, 54.0, 14.2], + &[10.4, 249.0, 83.0, 24.0], + &[7.2, 113.0, 65.0, 21.0], + &[2.2, 56.0, 57.0, 11.3], + &[6.0, 115.0, 66.0, 18.0], + &[9.7, 109.0, 52.0, 16.3], + &[15.4, 249.0, 66.0, 22.2], + &[2.1, 83.0, 51.0, 7.8], + &[11.3, 300.0, 67.0, 27.8], + &[4.4, 149.0, 85.0, 16.3], + &[12.1, 255.0, 74.0, 35.1], + &[2.7, 72.0, 66.0, 14.9], + &[16.1, 259.0, 44.0, 17.1], + &[9.0, 178.0, 70.0, 28.2], + &[6.0, 109.0, 53.0, 16.4], + &[4.3, 102.0, 62.0, 16.5], + &[12.2, 252.0, 81.0, 46.0], + &[2.1, 57.0, 56.0, 9.5], + &[7.4, 159.0, 89.0, 18.8], + &[11.4, 285.0, 70.0, 32.1], + &[11.1, 254.0, 86.0, 26.1], + &[13.0, 337.0, 45.0, 16.1], + &[0.8, 45.0, 44.0, 7.3], + &[7.3, 120.0, 75.0, 21.4], + &[6.6, 151.0, 68.0, 20.0], + &[4.9, 159.0, 67.0, 29.3], + &[6.3, 106.0, 72.0, 14.9], + &[3.4, 174.0, 87.0, 8.3], + &[14.4, 279.0, 48.0, 22.5], + &[3.8, 86.0, 45.0, 12.8], + &[13.2, 188.0, 59.0, 26.9], + &[12.7, 201.0, 80.0, 25.5], + &[3.2, 120.0, 80.0, 22.9], + &[2.2, 48.0, 32.0, 11.2], + &[8.5, 156.0, 63.0, 20.7], + &[4.0, 145.0, 73.0, 26.2], + &[5.7, 81.0, 39.0, 9.3], + &[2.6, 53.0, 66.0, 10.8], + &[6.8, 161.0, 60.0, 15.6], + ]); + + let expected = DenseMatrix::from_2d_array(&[ + &[243.54655757, -18.76673788], + &[268.36802004, -33.79304302], + &[305.93972467, -15.39087376], + &[197.28420365, -11.66808306], + &[293.43187394, 1.91163633], + ]); + let svd = SVD::fit(&x, 2, Default::default()).unwrap(); + + let x_transformed = svd.transform(&x).unwrap(); + + assert_eq!(svd.components.shape(), (x.shape().1, 2)); + + assert!(x_transformed + .slice(0..5, 0..2) + .approximate_eq(&expected, 1e-4)); + } + + #[test] + fn serde() { + let iris = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let svd = SVD::fit(&iris, 2, Default::default()).unwrap(); + + let deserialized_svd: SVD> = + serde_json::from_str(&serde_json::to_string(&svd).unwrap()).unwrap(); + + assert_eq!(svd, deserialized_svd); + } +}