Merge potential next release v0.4 (#187) Breaking Changes
* First draft of the new n-dimensional arrays + NB use case * Improves default implementation of multiple Array methods * Refactors tree methods * Adds matrix decomposition routines * Adds matrix decomposition methods to ndarray and nalgebra bindings * Refactoring + linear regression now uses array2 * Ridge & Linear regression * LBFGS optimizer & logistic regression * LBFGS optimizer & logistic regression * Changes linear methods, metrics and model selection methods to new n-dimensional arrays * Switches KNN and clustering algorithms to new n-d array layer * Refactors distance metrics * Optimizes knn and clustering methods * Refactors metrics module * Switches decomposition methods to n-dimensional arrays * Linalg refactoring - cleanup rng merge (#172) * Remove legacy DenseMatrix and BaseMatrix implementation. Port the new Number, FloatNumber and Array implementation into module structure. * Exclude AUC metrics. Needs reimplementation * Improve developers walkthrough New traits system in place at `src/numbers` and `src/linalg` Co-authored-by: Lorenzo <tunedconsulting@gmail.com> * Provide SupervisedEstimator with a constructor to avoid explicit dynamic `Box` allocation in 'cross_validate' and 'cross_validate_predict' as required by the use of 'dyn' as per Rust 2021 * Implement getters to use as_ref() in src/neighbors * Implement getters to use as_ref() in src/naive_bayes * Implement getters to use as_ref() in src/linear * Add Clone to src/naive_bayes * Change signature for cross_validate and other model_selection functions to abide by the use of `dyn` in Rust 2021 * Implement ndarray-bindings. Remove FloatNumber from implementations * Drop nalgebra-bindings support (as decided in conf-call to go for ndarray) * Remove benches. Benches will have their own repo at smartcore-benches * Implement SVC * Implement SVC serialization. Move search parameters into a dedicated module * Implement SVR. 
Definitely too slow * Fix compilation issues for wasm (#202) Co-authored-by: Luis Moreno <morenol@users.noreply.github.com> * Fix tests (#203) * Port linalg/traits/stats.rs * Improve methods naming * Improve Display for DenseMatrix Co-authored-by: Montana Low <montanalow@users.noreply.github.com> Co-authored-by: VolodymyrOrlov <volodymyr.orlov@gmail.com>
This commit is contained in:
+109
-79
@@ -10,7 +10,7 @@
|
||||
//!
|
||||
//! Example:
|
||||
//! ```
|
||||
//! use smartcore::linalg::naive::dense_matrix::*;
|
||||
//! use smartcore::linalg::basic::matrix::DenseMatrix;
|
||||
//! use smartcore::decomposition::pca::*;
|
||||
//!
|
||||
//! // Iris data
|
||||
@@ -52,24 +52,33 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::api::{Transformer, UnsupervisedEstimator};
|
||||
use crate::error::Failed;
|
||||
use crate::linalg::Matrix;
|
||||
use crate::math::num::RealNumber;
|
||||
use crate::linalg::basic::arrays::Array2;
|
||||
use crate::linalg::traits::evd::EVDDecomposable;
|
||||
use crate::linalg::traits::svd::SVDDecomposable;
|
||||
use crate::numbers::basenum::Number;
|
||||
use crate::numbers::realnum::RealNumber;
|
||||
|
||||
/// Principal components analysis algorithm
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[derive(Debug)]
|
||||
pub struct PCA<T: RealNumber, M: Matrix<T>> {
|
||||
eigenvectors: M,
|
||||
pub struct PCA<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> {
|
||||
eigenvectors: X,
|
||||
eigenvalues: Vec<T>,
|
||||
projection: M,
|
||||
projection: X,
|
||||
mu: Vec<T>,
|
||||
pmu: Vec<T>,
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> PartialEq for PCA<T, M> {
|
||||
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> PartialEq
|
||||
for PCA<T, X>
|
||||
{
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
if self.eigenvectors != other.eigenvectors
|
||||
|| self.eigenvalues.len() != other.eigenvalues.len()
|
||||
if self.eigenvalues.len() != other.eigenvalues.len()
|
||||
|| self
|
||||
.eigenvectors
|
||||
.iterator(0)
|
||||
.zip(other.eigenvectors.iterator(0))
|
||||
.any(|(&a, &b)| (a - b).abs() > T::epsilon())
|
||||
{
|
||||
false
|
||||
} else {
|
||||
@@ -196,24 +205,28 @@ impl Default for PCASearchParameters {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> UnsupervisedEstimator<M, PCAParameters> for PCA<T, M> {
|
||||
fn fit(x: &M, parameters: PCAParameters) -> Result<Self, Failed> {
|
||||
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>>
|
||||
UnsupervisedEstimator<X, PCAParameters> for PCA<T, X>
|
||||
{
|
||||
fn fit(x: &X, parameters: PCAParameters) -> Result<Self, Failed> {
|
||||
PCA::fit(x, parameters)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> Transformer<M> for PCA<T, M> {
|
||||
fn transform(&self, x: &M) -> Result<M, Failed> {
|
||||
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> Transformer<X>
|
||||
for PCA<T, X>
|
||||
{
|
||||
fn transform(&self, x: &X) -> Result<X, Failed> {
|
||||
self.transform(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
impl<T: Number + RealNumber, X: Array2<T> + SVDDecomposable<T> + EVDDecomposable<T>> PCA<T, X> {
|
||||
/// Fits PCA to your data.
|
||||
/// * `data` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
|
||||
/// * `n_components` - number of components to keep.
|
||||
/// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
|
||||
pub fn fit(data: &M, parameters: PCAParameters) -> Result<PCA<T, M>, Failed> {
|
||||
pub fn fit(data: &X, parameters: PCAParameters) -> Result<PCA<T, X>, Failed> {
|
||||
let (m, n) = data.shape();
|
||||
|
||||
if parameters.n_components > n {
|
||||
@@ -223,13 +236,17 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
)));
|
||||
}
|
||||
|
||||
let mu = data.column_mean();
|
||||
let mu: Vec<T> = data
|
||||
.mean_by(0)
|
||||
.iter()
|
||||
.map(|&v| T::from_f64(v).unwrap())
|
||||
.collect();
|
||||
|
||||
let mut x = data.clone();
|
||||
|
||||
for (c, mu_c) in mu.iter().enumerate().take(n) {
|
||||
for (c, &mu_c) in mu.iter().enumerate().take(n) {
|
||||
for r in 0..m {
|
||||
x.sub_element_mut(r, c, *mu_c);
|
||||
x.sub_element_mut((r, c), mu_c);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -245,33 +262,33 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
|
||||
eigenvectors = svd.V;
|
||||
} else {
|
||||
let mut cov = M::zeros(n, n);
|
||||
let mut cov = X::zeros(n, n);
|
||||
|
||||
for k in 0..m {
|
||||
for i in 0..n {
|
||||
for j in 0..=i {
|
||||
cov.add_element_mut(i, j, x.get(k, i) * x.get(k, j));
|
||||
cov.add_element_mut((i, j), *x.get((k, i)) * *x.get((k, j)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..n {
|
||||
for j in 0..=i {
|
||||
cov.div_element_mut(i, j, T::from(m).unwrap());
|
||||
cov.set(j, i, cov.get(i, j));
|
||||
cov.div_element_mut((i, j), T::from(m).unwrap());
|
||||
cov.set((j, i), *cov.get((i, j)));
|
||||
}
|
||||
}
|
||||
|
||||
if parameters.use_correlation_matrix {
|
||||
let mut sd = vec![T::zero(); n];
|
||||
for (i, sd_i) in sd.iter_mut().enumerate().take(n) {
|
||||
*sd_i = cov.get(i, i).sqrt();
|
||||
*sd_i = cov.get((i, i)).sqrt();
|
||||
}
|
||||
|
||||
for i in 0..n {
|
||||
for j in 0..=i {
|
||||
cov.div_element_mut(i, j, sd[i] * sd[j]);
|
||||
cov.set(j, i, cov.get(i, j));
|
||||
cov.div_element_mut((i, j), sd[i] * sd[j]);
|
||||
cov.set((j, i), *cov.get((i, j)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -283,7 +300,7 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
|
||||
for (i, sd_i) in sd.iter().enumerate().take(n) {
|
||||
for j in 0..n {
|
||||
eigenvectors.div_element_mut(i, j, *sd_i);
|
||||
eigenvectors.div_element_mut((i, j), *sd_i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -295,17 +312,17 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
}
|
||||
}
|
||||
|
||||
let mut projection = M::zeros(parameters.n_components, n);
|
||||
let mut projection = X::zeros(parameters.n_components, n);
|
||||
for i in 0..n {
|
||||
for j in 0..parameters.n_components {
|
||||
projection.set(j, i, eigenvectors.get(i, j));
|
||||
projection.set((j, i), *eigenvectors.get((i, j)));
|
||||
}
|
||||
}
|
||||
|
||||
let mut pmu = vec![T::zero(); parameters.n_components];
|
||||
for (k, mu_k) in mu.iter().enumerate().take(n) {
|
||||
for (i, pmu_i) in pmu.iter_mut().enumerate().take(parameters.n_components) {
|
||||
*pmu_i += projection.get(i, k) * (*mu_k);
|
||||
*pmu_i += *projection.get((i, k)) * (*mu_k);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,7 +337,7 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
|
||||
/// Run dimensionality reduction for `x`
|
||||
/// * `x` - _KxM_ data where _K_ is number of observations and _M_ is number of features.
|
||||
pub fn transform(&self, x: &M) -> Result<M, Failed> {
|
||||
pub fn transform(&self, x: &X) -> Result<X, Failed> {
|
||||
let (nrows, ncols) = x.shape();
|
||||
let (_, n_components) = self.projection.shape();
|
||||
if ncols != self.mu.len() {
|
||||
@@ -334,14 +351,14 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
let mut x_transformed = x.matmul(&self.projection);
|
||||
for r in 0..nrows {
|
||||
for c in 0..n_components {
|
||||
x_transformed.sub_element_mut(r, c, self.pmu[c]);
|
||||
x_transformed.sub_element_mut((r, c), self.pmu[c]);
|
||||
}
|
||||
}
|
||||
Ok(x_transformed)
|
||||
}
|
||||
|
||||
/// Get a projection matrix
|
||||
pub fn components(&self) -> &M {
|
||||
pub fn components(&self) -> &X {
|
||||
&self.projection
|
||||
}
|
||||
}
|
||||
@@ -349,7 +366,8 @@ impl<T: RealNumber, M: Matrix<T>> PCA<T, M> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::linalg::naive::dense_matrix::*;
|
||||
use crate::linalg::basic::matrix::DenseMatrix;
|
||||
use approx::relative_eq;
|
||||
|
||||
#[test]
|
||||
fn search_parameters() {
|
||||
@@ -442,7 +460,11 @@ mod tests {
|
||||
|
||||
let pca = PCA::fit(&us_arrests, Default::default()).unwrap();
|
||||
|
||||
assert!(expected.approximate_eq(&pca.components().abs(), 0.4));
|
||||
assert!(relative_eq!(
|
||||
expected,
|
||||
pca.components().abs(),
|
||||
epsilon = 1e-3
|
||||
));
|
||||
}
|
||||
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
|
||||
#[test]
|
||||
@@ -538,10 +560,11 @@ mod tests {
|
||||
|
||||
let pca = PCA::fit(&us_arrests, PCAParameters::default().with_n_components(4)).unwrap();
|
||||
|
||||
assert!(pca
|
||||
.eigenvectors
|
||||
.abs()
|
||||
.approximate_eq(&expected_eigenvectors.abs(), 1e-4));
|
||||
assert!(relative_eq!(
|
||||
pca.eigenvectors.abs(),
|
||||
&expected_eigenvectors.abs(),
|
||||
epsilon = 1e-4
|
||||
));
|
||||
|
||||
for i in 0..pca.eigenvalues.len() {
|
||||
assert!((pca.eigenvalues[i].abs() - expected_eigenvalues[i].abs()).abs() < 1e-8);
|
||||
@@ -549,9 +572,11 @@ mod tests {
|
||||
|
||||
let us_arrests_t = pca.transform(&us_arrests).unwrap();
|
||||
|
||||
assert!(us_arrests_t
|
||||
.abs()
|
||||
.approximate_eq(&expected_projection.abs(), 1e-4));
|
||||
assert!(relative_eq!(
|
||||
us_arrests_t.abs(),
|
||||
&expected_projection.abs(),
|
||||
epsilon = 1e-4
|
||||
));
|
||||
}
|
||||
|
||||
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
|
||||
@@ -654,10 +679,11 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
assert!(pca
|
||||
.eigenvectors
|
||||
.abs()
|
||||
.approximate_eq(&expected_eigenvectors.abs(), 1e-4));
|
||||
assert!(relative_eq!(
|
||||
pca.eigenvectors.abs(),
|
||||
&expected_eigenvectors.abs(),
|
||||
epsilon = 1e-4
|
||||
));
|
||||
|
||||
for i in 0..pca.eigenvalues.len() {
|
||||
assert!((pca.eigenvalues[i].abs() - expected_eigenvalues[i].abs()).abs() < 1e-8);
|
||||
@@ -665,43 +691,47 @@ mod tests {
|
||||
|
||||
let us_arrests_t = pca.transform(&us_arrests).unwrap();
|
||||
|
||||
assert!(us_arrests_t
|
||||
.abs()
|
||||
.approximate_eq(&expected_projection.abs(), 1e-4));
|
||||
assert!(relative_eq!(
|
||||
us_arrests_t.abs(),
|
||||
&expected_projection.abs(),
|
||||
epsilon = 1e-4
|
||||
));
|
||||
}
|
||||
|
||||
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
|
||||
#[test]
|
||||
#[cfg(feature = "serde")]
|
||||
fn serde() {
|
||||
let iris = DenseMatrix::from_2d_array(&[
|
||||
&[5.1, 3.5, 1.4, 0.2],
|
||||
&[4.9, 3.0, 1.4, 0.2],
|
||||
&[4.7, 3.2, 1.3, 0.2],
|
||||
&[4.6, 3.1, 1.5, 0.2],
|
||||
&[5.0, 3.6, 1.4, 0.2],
|
||||
&[5.4, 3.9, 1.7, 0.4],
|
||||
&[4.6, 3.4, 1.4, 0.3],
|
||||
&[5.0, 3.4, 1.5, 0.2],
|
||||
&[4.4, 2.9, 1.4, 0.2],
|
||||
&[4.9, 3.1, 1.5, 0.1],
|
||||
&[7.0, 3.2, 4.7, 1.4],
|
||||
&[6.4, 3.2, 4.5, 1.5],
|
||||
&[6.9, 3.1, 4.9, 1.5],
|
||||
&[5.5, 2.3, 4.0, 1.3],
|
||||
&[6.5, 2.8, 4.6, 1.5],
|
||||
&[5.7, 2.8, 4.5, 1.3],
|
||||
&[6.3, 3.3, 4.7, 1.6],
|
||||
&[4.9, 2.4, 3.3, 1.0],
|
||||
&[6.6, 2.9, 4.6, 1.3],
|
||||
&[5.2, 2.7, 3.9, 1.4],
|
||||
]);
|
||||
// Disable this test for now
|
||||
// TODO: implement deserialization for new DenseMatrix
|
||||
// #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
|
||||
// #[test]
|
||||
// #[cfg(feature = "serde")]
|
||||
// fn pca_serde() {
|
||||
// let iris = DenseMatrix::from_2d_array(&[
|
||||
// &[5.1, 3.5, 1.4, 0.2],
|
||||
// &[4.9, 3.0, 1.4, 0.2],
|
||||
// &[4.7, 3.2, 1.3, 0.2],
|
||||
// &[4.6, 3.1, 1.5, 0.2],
|
||||
// &[5.0, 3.6, 1.4, 0.2],
|
||||
// &[5.4, 3.9, 1.7, 0.4],
|
||||
// &[4.6, 3.4, 1.4, 0.3],
|
||||
// &[5.0, 3.4, 1.5, 0.2],
|
||||
// &[4.4, 2.9, 1.4, 0.2],
|
||||
// &[4.9, 3.1, 1.5, 0.1],
|
||||
// &[7.0, 3.2, 4.7, 1.4],
|
||||
// &[6.4, 3.2, 4.5, 1.5],
|
||||
// &[6.9, 3.1, 4.9, 1.5],
|
||||
// &[5.5, 2.3, 4.0, 1.3],
|
||||
// &[6.5, 2.8, 4.6, 1.5],
|
||||
// &[5.7, 2.8, 4.5, 1.3],
|
||||
// &[6.3, 3.3, 4.7, 1.6],
|
||||
// &[4.9, 2.4, 3.3, 1.0],
|
||||
// &[6.6, 2.9, 4.6, 1.3],
|
||||
// &[5.2, 2.7, 3.9, 1.4],
|
||||
// ]);
|
||||
|
||||
let pca = PCA::fit(&iris, Default::default()).unwrap();
|
||||
// let pca = PCA::fit(&iris, Default::default()).unwrap();
|
||||
|
||||
let deserialized_pca: PCA<f64, DenseMatrix<f64>> =
|
||||
serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap();
|
||||
// let deserialized_pca: PCA<f64, DenseMatrix<f64>> =
|
||||
// serde_json::from_str(&serde_json::to_string(&pca).unwrap()).unwrap();
|
||||
|
||||
assert_eq!(pca, deserialized_pca);
|
||||
}
|
||||
// assert_eq!(pca, deserialized_pca);
|
||||
// }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user