From 6d313821fa55dfb6a93062642275272d76ed0088 Mon Sep 17 00:00:00 2001
From: Volodymyr Orlov
Date: Wed, 2 Sep 2020 10:14:34 -0700
Subject: [PATCH] feat: documents pca

---
 src/cluster/kmeans.rs    |  2 +-
 src/decomposition/mod.rs | 14 ++++++++++
 src/decomposition/pca.rs | 58 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/cluster/kmeans.rs b/src/cluster/kmeans.rs
index 201918e..021be83 100644
--- a/src/cluster/kmeans.rs
+++ b/src/cluster/kmeans.rs
@@ -49,7 +49,7 @@
 //!
 //! ## References:
 //!
-//! * ["An Introduction to Statistical Learning", James et al., 10.3.1 K-Means Clustering](http://faculty.marshall.usc.edu/gareth-james/ISL/)
+//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 10.3.1 K-Means Clustering](http://faculty.marshall.usc.edu/gareth-james/ISL/)
 //! * ["k-means++: The Advantages of Careful Seeding", Arthur D., Vassilvitskii S.](http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf)
 
 extern crate rand;
diff --git a/src/decomposition/mod.rs b/src/decomposition/mod.rs
index a6bac38..bdbd3e9 100644
--- a/src/decomposition/mod.rs
+++ b/src/decomposition/mod.rs
@@ -1 +1,15 @@
+//! # Dimension Reduction Methods
+//! Dimension reduction is a popular approach for deriving a low-dimensional set of features from a large set of variables.
+//!
+//! High-dimensional data (data with many input features) often degrades the performance of machine learning algorithms due to the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality).
+//! The more dimensions a data set has, the more difficult it becomes to predict certain quantities. While it may seem that more explanatory variables are always better,
+//! the opposite is often true when it comes to adding variables: each added variable can result in an exponential decrease in predictive power.
+//! Therefore, it is often desirable to reduce the number of input features.
+//!
+//! Dimension reduction is also used for data visualization.
+//!
+//! ## References
+//! * ["An Introduction to Statistical Learning", James G., Witten D., Hastie T., Tibshirani R., 6.3 Dimension Reduction Methods](http://faculty.marshall.usc.edu/gareth-james/ISL/)
+
+/// PCA is a popular approach for deriving a low-dimensional set of features from a large set of variables.
 pub mod pca;
diff --git a/src/decomposition/pca.rs b/src/decomposition/pca.rs
index e1f7027..b565ce4 100644
--- a/src/decomposition/pca.rs
+++ b/src/decomposition/pca.rs
@@ -1,3 +1,49 @@
+//! # PCA
+//!
+//! Principal components analysis (PCA) is a method for selecting several linear combinations of features that capture most of the variation in your data.
+//! PCA is an unsupervised approach, since it involves only a set of features \\(X_1, X_2, \dots, X_n\\), and no associated response \\(Y\\).
+//! Apart from producing derived variables for use in supervised learning problems, PCA also serves as a tool for data visualization.
+//!
+//! PCA is sensitive to the scale of the variables. Before PCA is performed, the variables should be centered to have mean zero.
+//! Furthermore, the results obtained also depend on whether the variables have been individually scaled.
+//! Use the `use_correlation_matrix` parameter to standardize your variables (to mean 0 and standard deviation 1).
+//!
+//! Example:
+//! ```
+//! use smartcore::linalg::naive::dense_matrix::*;
+//! use smartcore::decomposition::pca::*;
+//!
+//! // Iris data
+//! let iris = DenseMatrix::from_array(&[
+//!     &[5.1, 3.5, 1.4, 0.2],
+//!     &[4.9, 3.0, 1.4, 0.2],
+//!     &[4.7, 3.2, 1.3, 0.2],
+//!     &[4.6, 3.1, 1.5, 0.2],
+//!     &[5.0, 3.6, 1.4, 0.2],
+//!     &[5.4, 3.9, 1.7, 0.4],
+//!     &[4.6, 3.4, 1.4, 0.3],
+//!     &[5.0, 3.4, 1.5, 0.2],
+//!     &[4.4, 2.9, 1.4, 0.2],
+//!     &[4.9, 3.1, 1.5, 0.1],
+//!     &[7.0, 3.2, 4.7, 1.4],
+//!     &[6.4, 3.2, 4.5, 1.5],
+//!     &[6.9, 3.1, 4.9, 1.5],
+//!     &[5.5, 2.3, 4.0, 1.3],
+//!     &[6.5, 2.8, 4.6, 1.5],
+//!     &[5.7, 2.8, 4.5, 1.3],
+//!     &[6.3, 3.3, 4.7, 1.6],
+//!     &[4.9, 2.4, 3.3, 1.0],
+//!     &[6.6, 2.9, 4.6, 1.3],
+//!     &[5.2, 2.7, 3.9, 1.4],
+//! ]);
+//!
+//! let pca = PCA::new(&iris, 2, Default::default()); // Reduce the number of features to 2
+//!
+//! let iris_reduced = pca.transform(&iris);
+//!
+//! ```
+//!
+//!
 use std::fmt::Debug;
 
 use serde::{Deserialize, Serialize};
@@ -5,6 +51,7 @@ use serde::{Deserialize, Serialize};
 use crate::linalg::Matrix;
 use crate::math::num::RealNumber;
 
+/// Principal components analysis algorithm
 #[derive(Serialize, Deserialize, Debug)]
 pub struct PCA<T: RealNumber, M: Matrix<T>> {
     eigenvectors: M,
@@ -32,8 +79,11 @@ impl<T: RealNumber, M: Matrix<T>> PartialEq for PCA<T, M> {
 }
 
 #[derive(Debug, Clone)]
+/// PCA parameters
 pub struct PCAParameters {
-    use_correlation_matrix: bool,
+    /// By default, the covariance matrix is used to compute the principal components.
+    /// Enable this flag if you want to use the correlation matrix instead.
+    pub use_correlation_matrix: bool,
 }
 
 impl Default for PCAParameters {
@@ -45,6 +95,10 @@ impl Default for PCAParameters {
 }
 
 impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
+    /// Fits PCA to your data.
+    /// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
+    /// * `n_components` - the number of components to keep.
+    /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
     pub fn new(data: &M, n_components: usize, parameters: PCAParameters) -> PCA<T, M> {
         let (m, n) = data.shape();
 
@@ -143,6 +197,8 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
         }
     }
 
+    /// Runs dimensionality reduction for `x`.
+    /// * `x` - _KxM_ data where _K_ is the number of observations and _M_ is the number of features.
     pub fn transform(&self, x: &M) -> M {
         let (nrows, ncols) = x.shape();
         let (_, n_components) = self.projection.shape();
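
The doctest in this patch only exercises the default, covariance-based configuration. A minimal sketch of the `use_correlation_matrix` option described in the new module docs, using only the `PCAParameters`, `PCA::new` and `transform` API shown in the diff (the input values below are made up purely for illustration):

```
use smartcore::linalg::naive::dense_matrix::*;
use smartcore::decomposition::pca::*;

// Toy data with features measured on very different scales (illustrative values only).
let x = DenseMatrix::from_array(&[
    &[1.0, 200.0, 0.01],
    &[2.0, 150.0, 0.07],
    &[3.0, 250.0, 0.04],
    &[4.0, 100.0, 0.09],
    &[5.0, 300.0, 0.02],
]);

// Compute principal components from the correlation matrix instead of the
// covariance matrix, i.e. standardize each variable to mean 0 and standard deviation 1.
let params = PCAParameters {
    use_correlation_matrix: true,
};

let pca = PCA::new(&x, 2, params); // Keep 2 components
let x_reduced = pca.transform(&x); // 5x2 matrix of projected observations
```

With the correlation matrix enabled, variables on large scales (such as the second column above) no longer dominate the principal components.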